1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_dsp_rtcd.h"
12
13#include "vpx_config.h"
14#include "vp8_rtcd.h"
15#include "encodemb.h"
16#include "vp8/common/reconinter.h"
17#include "vp8/encoder/quantize.h"
18#include "tokenize.h"
19#include "vp8/common/invtrans.h"
20#include "vpx_mem/vpx_mem.h"
21#include "rdopt.h"
22
23void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
24  unsigned char *src_ptr = (*(be->base_src) + be->src);
25  short *diff_ptr = be->src_diff;
26  unsigned char *pred_ptr = bd->predictor;
27  int src_stride = be->src_stride;
28
29  vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride, pred_ptr,
30                     pitch);
31}
32
33void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
34                       int src_stride, unsigned char *upred,
35                       unsigned char *vpred, int pred_stride) {
36  short *udiff = diff + 256;
37  short *vdiff = diff + 320;
38
39  vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
40  vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
41}
42
43void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
44                      unsigned char *pred, int pred_stride) {
45  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
46}
47
48static void vp8_subtract_mb(MACROBLOCK *x) {
49  BLOCK *b = &x->block[0];
50
51  vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride,
52                   x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
53  vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
54                    x->src.uv_stride, x->e_mbd.dst.u_buffer,
55                    x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
56}
57
58static void build_dcblock(MACROBLOCK *x) {
59  short *src_diff_ptr = &x->src_diff[384];
60  int i;
61
62  for (i = 0; i < 16; ++i) {
63    src_diff_ptr[i] = x->coeff[i * 16];
64  }
65}
66
67void vp8_transform_mbuv(MACROBLOCK *x) {
68  int i;
69
70  for (i = 16; i < 24; i += 2) {
71    x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
72  }
73}
74
75void vp8_transform_intra_mby(MACROBLOCK *x) {
76  int i;
77
78  for (i = 0; i < 16; i += 2) {
79    x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
80  }
81
82  /* build dc block from 16 y dc values */
83  build_dcblock(x);
84
85  /* do 2nd order transform on the dc block */
86  x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
87}
88
89static void transform_mb(MACROBLOCK *x) {
90  int i;
91
92  for (i = 0; i < 16; i += 2) {
93    x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
94  }
95
96  /* build dc block from 16 y dc values */
97  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) build_dcblock(x);
98
99  for (i = 16; i < 24; i += 2) {
100    x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
101  }
102
103  /* do 2nd order transform on the dc block */
104  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
105    x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
106  }
107}
108
109static void transform_mby(MACROBLOCK *x) {
110  int i;
111
112  for (i = 0; i < 16; i += 2) {
113    x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
114  }
115
116  /* build dc block from 16 y dc values */
117  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
118    build_dcblock(x);
119    x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
120  }
121}
122
123#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
124
125typedef struct vp8_token_state vp8_token_state;
126
127struct vp8_token_state {
128  int rate;
129  int error;
130  signed char next;
131  signed char token;
132  short qc;
133};
134
135/* TODO: experiments to find optimal multiple numbers */
136#define Y1_RD_MULT 4
137#define UV_RD_MULT 2
138#define Y2_RD_MULT 16
139
140static const int plane_rd_mult[4] = { Y1_RD_MULT, Y2_RD_MULT, UV_RD_MULT,
141                                      Y1_RD_MULT };
142
143static void optimize_b(MACROBLOCK *mb, int ib, int type, ENTROPY_CONTEXT *a,
144                       ENTROPY_CONTEXT *l) {
145  BLOCK *b;
146  BLOCKD *d;
147  vp8_token_state tokens[17][2];
148  unsigned best_mask[2];
149  const short *dequant_ptr;
150  const short *coeff_ptr;
151  short *qcoeff_ptr;
152  short *dqcoeff_ptr;
153  int eob;
154  int i0;
155  int rc;
156  int x;
157  int sz = 0;
158  int next;
159  int rdmult;
160  int rddiv;
161  int final_eob;
162  int rd_cost0;
163  int rd_cost1;
164  int rate0;
165  int rate1;
166  int error0;
167  int error1;
168  int t0;
169  int t1;
170  int best;
171  int band;
172  int pt;
173  int i;
174  int err_mult = plane_rd_mult[type];
175
176  b = &mb->block[ib];
177  d = &mb->e_mbd.block[ib];
178
179  dequant_ptr = d->dequant;
180  coeff_ptr = b->coeff;
181  qcoeff_ptr = d->qcoeff;
182  dqcoeff_ptr = d->dqcoeff;
183  i0 = !type;
184  eob = *d->eob;
185
186  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
187  rdmult = mb->rdmult * err_mult;
188  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
189    rdmult = (rdmult * 9) >> 4;
190  }
191
192  rddiv = mb->rddiv;
193  best_mask[0] = best_mask[1] = 0;
194  /* Initialize the sentinel node of the trellis. */
195  tokens[eob][0].rate = 0;
196  tokens[eob][0].error = 0;
197  tokens[eob][0].next = 16;
198  tokens[eob][0].token = DCT_EOB_TOKEN;
199  tokens[eob][0].qc = 0;
200  *(tokens[eob] + 1) = *(tokens[eob] + 0);
201  next = eob;
202  for (i = eob; i-- > i0;) {
203    int base_bits;
204    int d2;
205    int dx;
206
207    rc = vp8_default_zig_zag1d[i];
208    x = qcoeff_ptr[rc];
209    /* Only add a trellis state for non-zero coefficients. */
210    if (x) {
211      int shortcut = 0;
212      error0 = tokens[next][0].error;
213      error1 = tokens[next][1].error;
214      /* Evaluate the first possibility for this state. */
215      rate0 = tokens[next][0].rate;
216      rate1 = tokens[next][1].rate;
217      t0 = (vp8_dct_value_tokens_ptr + x)->Token;
218      /* Consider both possible successor states. */
219      if (next < 16) {
220        band = vp8_coef_bands[i + 1];
221        pt = vp8_prev_token_class[t0];
222        rate0 += mb->token_costs[type][band][pt][tokens[next][0].token];
223        rate1 += mb->token_costs[type][band][pt][tokens[next][1].token];
224      }
225      rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
226      rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
227      if (rd_cost0 == rd_cost1) {
228        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
229        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
230      }
231      /* And pick the best. */
232      best = rd_cost1 < rd_cost0;
233      base_bits = *(vp8_dct_value_cost_ptr + x);
234      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
235      d2 = dx * dx;
236      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
237      tokens[i][0].error = d2 + (best ? error1 : error0);
238      tokens[i][0].next = next;
239      tokens[i][0].token = t0;
240      tokens[i][0].qc = x;
241      best_mask[0] |= best << i;
242      /* Evaluate the second possibility for this state. */
243      rate0 = tokens[next][0].rate;
244      rate1 = tokens[next][1].rate;
245
246      if ((abs(x) * dequant_ptr[rc] > abs(coeff_ptr[rc])) &&
247          (abs(x) * dequant_ptr[rc] < abs(coeff_ptr[rc]) + dequant_ptr[rc])) {
248        shortcut = 1;
249      } else {
250        shortcut = 0;
251      }
252
253      if (shortcut) {
254        sz = -(x < 0);
255        x -= 2 * sz + 1;
256      }
257
258      /* Consider both possible successor states. */
259      if (!x) {
260        /* If we reduced this coefficient to zero, check to see if
261         *  we need to move the EOB back here.
262         */
263        t0 =
264            tokens[next][0].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN;
265        t1 =
266            tokens[next][1].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN;
267      } else {
268        t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token;
269      }
270      if (next < 16) {
271        band = vp8_coef_bands[i + 1];
272        if (t0 != DCT_EOB_TOKEN) {
273          pt = vp8_prev_token_class[t0];
274          rate0 += mb->token_costs[type][band][pt][tokens[next][0].token];
275        }
276        if (t1 != DCT_EOB_TOKEN) {
277          pt = vp8_prev_token_class[t1];
278          rate1 += mb->token_costs[type][band][pt][tokens[next][1].token];
279        }
280      }
281
282      rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
283      rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
284      if (rd_cost0 == rd_cost1) {
285        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
286        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
287      }
288      /* And pick the best. */
289      best = rd_cost1 < rd_cost0;
290      base_bits = *(vp8_dct_value_cost_ptr + x);
291
292      if (shortcut) {
293        dx -= (dequant_ptr[rc] + sz) ^ sz;
294        d2 = dx * dx;
295      }
296      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
297      tokens[i][1].error = d2 + (best ? error1 : error0);
298      tokens[i][1].next = next;
299      tokens[i][1].token = best ? t1 : t0;
300      tokens[i][1].qc = x;
301      best_mask[1] |= best << i;
302      /* Finally, make this the new head of the trellis. */
303      next = i;
304    }
305    /* There's no choice to make for a zero coefficient, so we don't
306     *  add a new trellis node, but we do need to update the costs.
307     */
308    else {
309      band = vp8_coef_bands[i + 1];
310      t0 = tokens[next][0].token;
311      t1 = tokens[next][1].token;
312      /* Update the cost of each path if we're past the EOB token. */
313      if (t0 != DCT_EOB_TOKEN) {
314        tokens[next][0].rate += mb->token_costs[type][band][0][t0];
315        tokens[next][0].token = ZERO_TOKEN;
316      }
317      if (t1 != DCT_EOB_TOKEN) {
318        tokens[next][1].rate += mb->token_costs[type][band][0][t1];
319        tokens[next][1].token = ZERO_TOKEN;
320      }
321      /* Don't update next, because we didn't add a new node. */
322    }
323  }
324
325  /* Now pick the best path through the whole trellis. */
326  band = vp8_coef_bands[i + 1];
327  VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
328  rate0 = tokens[next][0].rate;
329  rate1 = tokens[next][1].rate;
330  error0 = tokens[next][0].error;
331  error1 = tokens[next][1].error;
332  t0 = tokens[next][0].token;
333  t1 = tokens[next][1].token;
334  rate0 += mb->token_costs[type][band][pt][t0];
335  rate1 += mb->token_costs[type][band][pt][t1];
336  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
337  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
338  if (rd_cost0 == rd_cost1) {
339    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
340    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
341  }
342  best = rd_cost1 < rd_cost0;
343  final_eob = i0 - 1;
344  for (i = next; i < eob; i = next) {
345    x = tokens[i][best].qc;
346    if (x) final_eob = i;
347    rc = vp8_default_zig_zag1d[i];
348    qcoeff_ptr[rc] = x;
349    dqcoeff_ptr[rc] = x * dequant_ptr[rc];
350    next = tokens[i][best].next;
351    best = (best_mask[best] >> i) & 1;
352  }
353  final_eob++;
354
355  *a = *l = (final_eob != !type);
356  *d->eob = (char)final_eob;
357}
358static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type, ENTROPY_CONTEXT *a,
359                                   ENTROPY_CONTEXT *l) {
360  int sum = 0;
361  int i;
362  BLOCKD *bd = &x->block[24];
363
364  if (bd->dequant[0] >= 35 && bd->dequant[1] >= 35) return;
365
366  for (i = 0; i < (*bd->eob); ++i) {
367    int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
368    sum += (coef >= 0) ? coef : -coef;
369    if (sum >= 35) return;
370  }
371  /**************************************************************************
372  our inverse hadamard transform effectively is weighted sum of all 16 inputs
373  with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And
374  dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the
375  output after inverse wht and idct will be all zero. A sum of absolute value
376  smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht
377  fall between -35 and +35.
378  **************************************************************************/
379  if (sum < 35) {
380    for (i = 0; i < (*bd->eob); ++i) {
381      int rc = vp8_default_zig_zag1d[i];
382      bd->qcoeff[rc] = 0;
383      bd->dqcoeff[rc] = 0;
384    }
385    *bd->eob = 0;
386    *a = *l = (*bd->eob != !type);
387  }
388}
389
390static void optimize_mb(MACROBLOCK *x) {
391  int b;
392  int type;
393  int has_2nd_order;
394
395  ENTROPY_CONTEXT_PLANES t_above, t_left;
396  ENTROPY_CONTEXT *ta;
397  ENTROPY_CONTEXT *tl;
398
399  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
400  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
401
402  ta = (ENTROPY_CONTEXT *)&t_above;
403  tl = (ENTROPY_CONTEXT *)&t_left;
404
405  has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
406                   x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
407  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
408
409  for (b = 0; b < 16; ++b) {
410    optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]);
411  }
412
413  for (b = 16; b < 24; ++b) {
414    optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b],
415               tl + vp8_block2left[b]);
416  }
417
418  if (has_2nd_order) {
419    b = 24;
420    optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b],
421               tl + vp8_block2left[b]);
422    check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b],
423                           tl + vp8_block2left[b]);
424  }
425}
426
427void vp8_optimize_mby(MACROBLOCK *x) {
428  int b;
429  int type;
430  int has_2nd_order;
431
432  ENTROPY_CONTEXT_PLANES t_above, t_left;
433  ENTROPY_CONTEXT *ta;
434  ENTROPY_CONTEXT *tl;
435
436  if (!x->e_mbd.above_context) return;
437
438  if (!x->e_mbd.left_context) return;
439
440  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
441  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
442
443  ta = (ENTROPY_CONTEXT *)&t_above;
444  tl = (ENTROPY_CONTEXT *)&t_left;
445
446  has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
447                   x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
448  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
449
450  for (b = 0; b < 16; ++b) {
451    optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]);
452  }
453
454  if (has_2nd_order) {
455    b = 24;
456    optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b],
457               tl + vp8_block2left[b]);
458    check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b],
459                           tl + vp8_block2left[b]);
460  }
461}
462
463void vp8_optimize_mbuv(MACROBLOCK *x) {
464  int b;
465  ENTROPY_CONTEXT_PLANES t_above, t_left;
466  ENTROPY_CONTEXT *ta;
467  ENTROPY_CONTEXT *tl;
468
469  if (!x->e_mbd.above_context) return;
470
471  if (!x->e_mbd.left_context) return;
472
473  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
474  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
475
476  ta = (ENTROPY_CONTEXT *)&t_above;
477  tl = (ENTROPY_CONTEXT *)&t_left;
478
479  for (b = 16; b < 24; ++b) {
480    optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b],
481               tl + vp8_block2left[b]);
482  }
483}
484
485void vp8_encode_inter16x16(MACROBLOCK *x) {
486  vp8_build_inter_predictors_mb(&x->e_mbd);
487
488  vp8_subtract_mb(x);
489
490  transform_mb(x);
491
492  vp8_quantize_mb(x);
493
494  if (x->optimize) optimize_mb(x);
495}
496
497/* this funciton is used by first pass only */
498void vp8_encode_inter16x16y(MACROBLOCK *x) {
499  BLOCK *b = &x->block[0];
500
501  vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
502                                      x->e_mbd.dst.y_stride);
503
504  vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride,
505                   x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
506
507  transform_mby(x);
508
509  vp8_quantize_mby(x);
510
511  vp8_inverse_transform_mby(&x->e_mbd);
512}
513