1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <math.h>
13#include <stdio.h>
14
15#include "./vp9_rtcd.h"
16
17#include "vpx_dsp/vpx_dsp_common.h"
18#include "vpx_mem/vpx_mem.h"
19#include "vpx_ports/bitops.h"
20#include "vpx_ports/mem.h"
21#include "vpx_ports/system_state.h"
22
23#include "vp9/common/vp9_common.h"
24#include "vp9/common/vp9_entropy.h"
25#include "vp9/common/vp9_entropymode.h"
26#include "vp9/common/vp9_mvref_common.h"
27#include "vp9/common/vp9_pred_common.h"
28#include "vp9/common/vp9_quant_common.h"
29#include "vp9/common/vp9_reconinter.h"
30#include "vp9/common/vp9_reconintra.h"
31#include "vp9/common/vp9_seg_common.h"
32
33#include "vp9/encoder/vp9_cost.h"
34#include "vp9/encoder/vp9_encodemb.h"
35#include "vp9/encoder/vp9_encodemv.h"
36#include "vp9/encoder/vp9_encoder.h"
37#include "vp9/encoder/vp9_mcomp.h"
38#include "vp9/encoder/vp9_quantize.h"
39#include "vp9/encoder/vp9_ratectrl.h"
40#include "vp9/encoder/vp9_rd.h"
41#include "vp9/encoder/vp9_tokenize.h"
42
// Exponent applied to the normalized DC quantizer value in
// compute_rd_thresh_factor() when deriving the rd threshold factor.
#define RD_THRESH_POW 1.25

// Factor to weigh the rate for switchable interp filters. It is multiplied
// into the cost returned by vp9_get_switchable_rate().
#define SWITCHABLE_INTERP_RATE_FACTOR 1
47
48void vp9_rd_cost_reset(RD_COST *rd_cost) {
49  rd_cost->rate = INT_MAX;
50  rd_cost->dist = INT64_MAX;
51  rd_cost->rdcost = INT64_MAX;
52}
53
54void vp9_rd_cost_init(RD_COST *rd_cost) {
55  rd_cost->rate = 0;
56  rd_cost->dist = 0;
57  rd_cost->rdcost = 0;
58}
59
60// The baseline rd thresholds for breaking out of the rd loop for
61// certain modes are assumed to be based on 8x8 blocks.
62// This table is used to correct for block size.
63// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
64static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
65  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
66};
67
68static void fill_mode_costs(VP9_COMP *cpi) {
69  const FRAME_CONTEXT *const fc = cpi->common.fc;
70  int i, j;
71
72  for (i = 0; i < INTRA_MODES; ++i)
73    for (j = 0; j < INTRA_MODES; ++j)
74      vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
75                      vp9_intra_mode_tree);
76
77  vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
78  for (i = 0; i < INTRA_MODES; ++i) {
79    vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME][i],
80                    vp9_kf_uv_mode_prob[i], vp9_intra_mode_tree);
81    vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME][i],
82                    fc->uv_mode_prob[i], vp9_intra_mode_tree);
83  }
84
85  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
86    vp9_cost_tokens(cpi->switchable_interp_costs[i],
87                    fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
88}
89
90static void fill_token_costs(vp9_coeff_cost *c,
91                             vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
92  int i, j, k, l;
93  TX_SIZE t;
94  for (t = TX_4X4; t <= TX_32X32; ++t)
95    for (i = 0; i < PLANE_TYPES; ++i)
96      for (j = 0; j < REF_TYPES; ++j)
97        for (k = 0; k < COEF_BANDS; ++k)
98          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
99            vpx_prob probs[ENTROPY_NODES];
100            vp9_model_to_full_probs(p[t][i][j][k][l], probs);
101            vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, vp9_coef_tree);
102            vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
103                                 vp9_coef_tree);
104            assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
105                   c[t][i][j][k][1][l][EOB_TOKEN]);
106          }
107}
108
// SAD-per-bit lookup tables with one entry per quantizer index. Values are
// now correlated to quantizer: vp9_init_me_luts() fills them from the real
// quantizer value of each index, and vp9_initialize_me_consts() reads them
// into x->sadperbit16 / x->sadperbit4.
static int sad_per_bit16lut_8[QINDEX_RANGE];
static int sad_per_bit4lut_8[QINDEX_RANGE];

#if CONFIG_VP9_HIGHBITDEPTH
// Matching tables for the 10-bit and 12-bit bit depths.
static int sad_per_bit16lut_10[QINDEX_RANGE];
static int sad_per_bit4lut_10[QINDEX_RANGE];
static int sad_per_bit16lut_12[QINDEX_RANGE];
static int sad_per_bit4lut_12[QINDEX_RANGE];
#endif
119
120static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
121                            vpx_bit_depth_t bit_depth) {
122  int i;
123  // Initialize the sad lut tables using a formulaic calculation for now.
124  // This is to make it easier to resolve the impact of experimental changes
125  // to the quantizer tables.
126  for (i = 0; i < range; i++) {
127    const double q = vp9_convert_qindex_to_q(i, bit_depth);
128    bit16lut[i] = (int)(0.0418 * q + 2.4107);
129    bit4lut[i] = (int)(0.063 * q + 2.742);
130  }
131}
132
133void vp9_init_me_luts(void) {
134  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
135                  VPX_BITS_8);
136#if CONFIG_VP9_HIGHBITDEPTH
137  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
138                  VPX_BITS_10);
139  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
140                  VPX_BITS_12);
141#endif
142}
143
// Extra rd multiplier weight as a function of golden-frame boost.
// vp9_compute_rd_mult() indexes this with min(15, gfu_boost / 100) and adds
// (rdmult * entry) >> 7 to the multiplier, so 64 means +50% and 0 means none.
static const int rd_boost_factor[16] = {
  64, 32, 32, 32,  // index 0..3
  24, 16, 12, 12,  // index 4..7
  8,  8,  4,  4,   // index 8..11
  2,  2,  1,  0    // index 12..15
};
146static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
147                                                              128, 144 };
148
149int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
150  const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
151#if CONFIG_VP9_HIGHBITDEPTH
152  int64_t rdmult = 0;
153  switch (cpi->common.bit_depth) {
154    case VPX_BITS_8: rdmult = 88 * q * q / 24; break;
155    case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;
156    case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;
157    default:
158      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
159      return -1;
160  }
161#else
162  int64_t rdmult = 88 * q * q / 24;
163#endif  // CONFIG_VP9_HIGHBITDEPTH
164  return rdmult;
165}
166
167int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
168  int64_t rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
169
170  if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
171    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
172    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
173    const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100));
174
175    rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
176    rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
177  }
178  if (rdmult < 1) rdmult = 1;
179  return (int)rdmult;
180}
181
182static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
183  double q;
184#if CONFIG_VP9_HIGHBITDEPTH
185  switch (bit_depth) {
186    case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break;
187    case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break;
188    case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break;
189    default:
190      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
191      return -1;
192  }
193#else
194  (void)bit_depth;
195  q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
196#endif  // CONFIG_VP9_HIGHBITDEPTH
197  // TODO(debargha): Adjust the function below.
198  return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
199}
200
201void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
202#if CONFIG_VP9_HIGHBITDEPTH
203  switch (cpi->common.bit_depth) {
204    case VPX_BITS_8:
205      x->sadperbit16 = sad_per_bit16lut_8[qindex];
206      x->sadperbit4 = sad_per_bit4lut_8[qindex];
207      break;
208    case VPX_BITS_10:
209      x->sadperbit16 = sad_per_bit16lut_10[qindex];
210      x->sadperbit4 = sad_per_bit4lut_10[qindex];
211      break;
212    case VPX_BITS_12:
213      x->sadperbit16 = sad_per_bit16lut_12[qindex];
214      x->sadperbit4 = sad_per_bit4lut_12[qindex];
215      break;
216    default:
217      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
218  }
219#else
220  (void)cpi;
221  x->sadperbit16 = sad_per_bit16lut_8[qindex];
222  x->sadperbit4 = sad_per_bit4lut_8[qindex];
223#endif  // CONFIG_VP9_HIGHBITDEPTH
224}
225
226static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
227  int i, bsize, segment_id;
228
229  for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
230    const int qindex =
231        clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
232                  cm->y_dc_delta_q,
233              0, MAXQ);
234    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
235
236    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
237      // Threshold here seems unnecessarily harsh but fine given actual
238      // range of values used for cpi->sf.thresh_mult[].
239      const int t = q * rd_thresh_block_size_factor[bsize];
240      const int thresh_max = INT_MAX / t;
241
242      if (bsize >= BLOCK_8X8) {
243        for (i = 0; i < MAX_MODES; ++i)
244          rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
245                                                   ? rd->thresh_mult[i] * t / 4
246                                                   : INT_MAX;
247      } else {
248        for (i = 0; i < MAX_REFS; ++i)
249          rd->threshes[segment_id][bsize][i] =
250              rd->thresh_mult_sub8x8[i] < thresh_max
251                  ? rd->thresh_mult_sub8x8[i] * t / 4
252                  : INT_MAX;
253      }
254    }
255  }
256}
257
258void vp9_initialize_rd_consts(VP9_COMP *cpi) {
259  VP9_COMMON *const cm = &cpi->common;
260  MACROBLOCK *const x = &cpi->td.mb;
261  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
262  RD_OPT *const rd = &cpi->rd;
263  int i;
264
265  vpx_clear_system_state();
266
267  rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
268  rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
269
270  set_error_per_bit(x, rd->RDMULT);
271
272  x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
273                       cm->frame_type != KEY_FRAME)
274                          ? 0
275                          : 1;
276
277  set_block_thresholds(cm, rd);
278  set_partition_probs(cm, xd);
279
280  if (cpi->oxcf.pass == 1) {
281    if (!frame_is_intra_only(cm))
282      vp9_build_nmv_cost_table(
283          x->nmvjointcost,
284          cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
285          &cm->fc->nmvc, cm->allow_high_precision_mv);
286  } else {
287    if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
288      fill_token_costs(x->token_costs, cm->fc->coef_probs);
289
290    if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
291        cm->frame_type == KEY_FRAME) {
292      for (i = 0; i < PARTITION_CONTEXTS; ++i)
293        vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
294                        vp9_partition_tree);
295    }
296
297    if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
298        cm->frame_type == KEY_FRAME) {
299      fill_mode_costs(cpi);
300
301      if (!frame_is_intra_only(cm)) {
302        vp9_build_nmv_cost_table(
303            x->nmvjointcost,
304            cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
305            &cm->fc->nmvc, cm->allow_high_precision_mv);
306
307        for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
308          vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
309                          cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
310      }
311    }
312  }
313}
314
// NOTE: rate_tab_q10, dist_tab_q10 and xsq_iq_q10 must all have the same
// number of entries.

// The modelled functions are sampled at the four most significant bits of
// (x^2 + 8/256). Each row below holds the 8 samples of one octave, matching
// the (k << 3) + low-bits index computed in model_rd_norm().

// Normalized rate (Q10):
// Models the rate of a Laplacian source with a given variance when quantized
// by a uniform quantizer with a given step size. The closed form is
//   Rn(x) = H(sqrt(r)) + sqrt(r) * [1 + H(r) / (1 - r)],
// where r = exp(-sqrt(2) * x), x = qpstep / sqrt(variance) and H() is the
// binary entropy function.
static const int rate_tab_q10[] = {
  65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651,  //
  4553,  4389, 4255, 4142, 4044, 3958, 3881, 3811,  //
  3748,  3635, 3538, 3453, 3376, 3307, 3244, 3186,  //
  3133,  3037, 2952, 2877, 2809, 2747, 2690, 2638,  //
  2589,  2501, 2423, 2353, 2290, 2232, 2179, 2130,  //
  2084,  2001, 1928, 1862, 1802, 1748, 1698, 1651,  //
  1608,  1530, 1460, 1398, 1342, 1290, 1243, 1199,  //
  1159,  1086, 1021, 963,  911,  864,  821,  781,   //
  745,   680,  623,  574,  530,  490,  455,  424,   //
  395,   345,  304,  269,  239,  213,  190,  171,   //
  154,   126,  104,  87,   73,   61,   52,   44,    //
  38,    28,   21,   16,   12,   10,   8,    6,     //
  5,     3,    2,    1,    1,    1,    0,    0,     //
};
337
// Normalized distortion (Q10):
// Models the normalized distortion of a Laplacian source with a given
// variance when quantized by a uniform quantizer with a given step size.
// The closed form is
//   Dn(x) = 1 - 1/sqrt(2) * x / sinh(x / sqrt(2)),
// where x = qpstep / sqrt(variance). The actual distortion is Dn * variance.
// Rows hold 8 samples each, one row per octave of the lookup index.
static const int dist_tab_q10[] = {
  0,    0,    1,    1,    1,    2,    2,    2,     //
  3,    3,    4,    5,    5,    6,    7,    7,     //
  8,    9,    11,   12,   13,   15,   16,   17,    //
  18,   21,   24,   26,   29,   31,   34,   36,    //
  39,   44,   49,   54,   59,   64,   69,   73,    //
  78,   88,   97,   106,  115,  124,  133,  142,   //
  151,  167,  184,  200,  215,  231,  245,  260,   //
  274,  301,  327,  351,  375,  397,  418,  439,   //
  458,  495,  528,  559,  587,  613,  637,  659,   //
  680,  717,  749,  777,  801,  823,  842,  859,   //
  874,  899,  919,  936,  949,  960,  969,  977,   //
  983,  994,  1001, 1006, 1010, 1013, 1015, 1017,  //
  1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,  //
};
// Sample points (x^2 in Q10) at which rate_tab_q10 and dist_tab_q10 are
// tabulated. model_rd_norm() subtracts the entry at the lookup index from
// its input to obtain the interpolation offset.
// Row k holds 8 points starting at 32 * (2^k - 1) and spaced 4 << k apart.
static const int xsq_iq_q10[] = {
  0,      4,      8,      12,     16,     20,     24,     28,      //
  32,     40,     48,     56,     64,     72,     80,     88,      //
  96,     112,    128,    144,    160,    176,    192,    208,     //
  224,    256,    288,    320,    352,    384,    416,    448,     //
  480,    544,    608,    672,    736,    800,    864,    928,     //
  992,    1120,   1248,   1376,   1504,   1632,   1760,   1888,    //
  2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,    //
  4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,    //
  8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,   //
  16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,   //
  32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,   //
  65504,  73696,  81888,  90080,  98272,  106464, 114656, 122848,  //
  131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,  //
};
369
370static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
371  const int tmp = (xsq_q10 >> 2) + 8;
372  const int k = get_msb(tmp) - 3;
373  const int xq = (k << 3) + ((tmp >> k) & 0x7);
374  const int one_q10 = 1 << 10;
375  const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
376  const int b_q10 = one_q10 - a_q10;
377  *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
378  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
379}
380
381static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],
382                              int r_q10[MAX_MB_PLANE],
383                              int d_q10[MAX_MB_PLANE]) {
384  int i;
385  const int one_q10 = 1 << 10;
386  for (i = 0; i < MAX_MB_PLANE; ++i) {
387    const int tmp = (xsq_q10[i] >> 2) + 8;
388    const int k = get_msb(tmp) - 3;
389    const int xq = (k << 3) + ((tmp >> k) & 0x7);
390    const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);
391    const int b_q10 = one_q10 - a_q10;
392    r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
393    d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
394  }
395}
396
// Upper clamp for x^2 (Q10) before calling model_rd_norm(). It is one less
// than the last entry of xsq_iq_q10, which keeps the "index + 1" table reads
// in model_rd_norm() within the last valid entry.
static const uint32_t MAX_XSQ_Q10 = 245727;
398
399void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
400                                  unsigned int qstep, int *rate,
401                                  int64_t *dist) {
402  // This function models the rate and distortion for a Laplacian
403  // source with given variance when quantized with a uniform quantizer
404  // with given stepsize. The closed form expressions are in:
405  // Hang and Chen, "Source Model for transform video coder and its
406  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
407  // Sys. for Video Tech., April 1997.
408  if (var == 0) {
409    *rate = 0;
410    *dist = 0;
411  } else {
412    int d_q10, r_q10;
413    const uint64_t xsq_q10_64 =
414        (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
415    const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
416    model_rd_norm(xsq_q10, &r_q10, &d_q10);
417    *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT);
418    *dist = (var * (int64_t)d_q10 + 512) >> 10;
419  }
420}
421
422// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
423// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
424void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
425                                      unsigned int n_log2[MAX_MB_PLANE],
426                                      unsigned int qstep[MAX_MB_PLANE],
427                                      int64_t *rate_sum, int64_t *dist_sum) {
428  int i;
429  int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
430  for (i = 0; i < MAX_MB_PLANE; ++i) {
431    const uint64_t xsq_q10_64 =
432        (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /
433        var[i];
434    xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
435  }
436  model_rd_norm_vec(xsq_q10, r_q10, d_q10);
437  for (i = 0; i < MAX_MB_PLANE; ++i) {
438    int rate =
439        ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);
440    int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;
441    *rate_sum += rate;
442    *dist_sum += dist;
443  }
444}
445
446void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
447                              const struct macroblockd_plane *pd,
448                              ENTROPY_CONTEXT t_above[16],
449                              ENTROPY_CONTEXT t_left[16]) {
450  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
451  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
452  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
453  const ENTROPY_CONTEXT *const above = pd->above_context;
454  const ENTROPY_CONTEXT *const left = pd->left_context;
455
456  int i;
457  switch (tx_size) {
458    case TX_4X4:
459      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
460      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
461      break;
462    case TX_8X8:
463      for (i = 0; i < num_4x4_w; i += 2)
464        t_above[i] = !!*(const uint16_t *)&above[i];
465      for (i = 0; i < num_4x4_h; i += 2)
466        t_left[i] = !!*(const uint16_t *)&left[i];
467      break;
468    case TX_16X16:
469      for (i = 0; i < num_4x4_w; i += 4)
470        t_above[i] = !!*(const uint32_t *)&above[i];
471      for (i = 0; i < num_4x4_h; i += 4)
472        t_left[i] = !!*(const uint32_t *)&left[i];
473      break;
474    case TX_32X32:
475      for (i = 0; i < num_4x4_w; i += 8)
476        t_above[i] = !!*(const uint64_t *)&above[i];
477      for (i = 0; i < num_4x4_h; i += 8)
478        t_left[i] = !!*(const uint64_t *)&left[i];
479      break;
480    default: assert(0 && "Invalid transform size."); break;
481  }
482}
483
484void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
485                 int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
486  int i;
487  int zero_seen = 0;
488  int best_index = 0;
489  int best_sad = INT_MAX;
490  int this_sad = INT_MAX;
491  int max_mv = 0;
492  int near_same_nearest;
493  uint8_t *src_y_ptr = x->plane[0].src.buf;
494  uint8_t *ref_y_ptr;
495  const int num_mv_refs =
496      MAX_MV_REF_CANDIDATES +
497      (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size);
498
499  MV pred_mv[3];
500  pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
501  pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
502  pred_mv[2] = x->pred_mv[ref_frame];
503  assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
504
505  near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
506                      x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
507  // Get the sad for each candidate reference mv.
508  for (i = 0; i < num_mv_refs; ++i) {
509    const MV *this_mv = &pred_mv[i];
510    int fp_row, fp_col;
511
512    if (i == 1 && near_same_nearest) continue;
513    fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
514    fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
515    max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
516
517    if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
518    zero_seen |= (fp_row == 0 && fp_col == 0);
519
520    ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
521    // Find sad for current vector.
522    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
523                                           ref_y_ptr, ref_y_stride);
524    // Note if it is the best so far.
525    if (this_sad < best_sad) {
526      best_sad = this_sad;
527      best_index = i;
528    }
529  }
530
531  // Note the index of the mv that worked best in the reference list.
532  x->mv_best_ref_index[ref_frame] = best_index;
533  x->max_mv_context[ref_frame] = max_mv;
534  x->pred_mv_sad[ref_frame] = best_sad;
535}
536
537void vp9_setup_pred_block(const MACROBLOCKD *xd,
538                          struct buf_2d dst[MAX_MB_PLANE],
539                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
540                          const struct scale_factors *scale,
541                          const struct scale_factors *scale_uv) {
542  int i;
543
544  dst[0].buf = src->y_buffer;
545  dst[0].stride = src->y_stride;
546  dst[1].buf = src->u_buffer;
547  dst[2].buf = src->v_buffer;
548  dst[1].stride = dst[2].stride = src->uv_stride;
549
550  for (i = 0; i < MAX_MB_PLANE; ++i) {
551    setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
552                     i ? scale_uv : scale, xd->plane[i].subsampling_x,
553                     xd->plane[i].subsampling_y);
554  }
555}
556
557int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
558                            int stride) {
559  const int bw = b_width_log2_lookup[plane_bsize];
560  const int y = 4 * (raster_block >> bw);
561  const int x = 4 * (raster_block & ((1 << bw) - 1));
562  return y * stride + x;
563}
564
565int16_t *vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
566                                       int16_t *base) {
567  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
568  return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
569}
570
571YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
572                                             int ref_frame) {
573  const VP9_COMMON *const cm = &cpi->common;
574  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
575  const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
576  return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
577             ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
578             : NULL;
579}
580
581int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
582  const MODE_INFO *const mi = xd->mi[0];
583  const int ctx = get_pred_context_switchable_interp(xd);
584  return SWITCHABLE_INTERP_RATE_FACTOR *
585         cpi->switchable_interp_costs[ctx][mi->interp_filter];
586}
587
588void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
589  int i;
590  RD_OPT *const rd = &cpi->rd;
591  SPEED_FEATURES *const sf = &cpi->sf;
592
593  // Set baseline threshold values.
594  for (i = 0; i < MAX_MODES; ++i)
595    rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
596
597  if (sf->adaptive_rd_thresh) {
598    rd->thresh_mult[THR_NEARESTMV] = 300;
599    rd->thresh_mult[THR_NEARESTG] = 300;
600    rd->thresh_mult[THR_NEARESTA] = 300;
601  } else {
602    rd->thresh_mult[THR_NEARESTMV] = 0;
603    rd->thresh_mult[THR_NEARESTG] = 0;
604    rd->thresh_mult[THR_NEARESTA] = 0;
605  }
606
607  rd->thresh_mult[THR_DC] += 1000;
608
609  rd->thresh_mult[THR_NEWMV] += 1000;
610  rd->thresh_mult[THR_NEWA] += 1000;
611  rd->thresh_mult[THR_NEWG] += 1000;
612
613  rd->thresh_mult[THR_NEARMV] += 1000;
614  rd->thresh_mult[THR_NEARA] += 1000;
615  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
616  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
617
618  rd->thresh_mult[THR_TM] += 1000;
619
620  rd->thresh_mult[THR_COMP_NEARLA] += 1500;
621  rd->thresh_mult[THR_COMP_NEWLA] += 2000;
622  rd->thresh_mult[THR_NEARG] += 1000;
623  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
624  rd->thresh_mult[THR_COMP_NEWGA] += 2000;
625
626  rd->thresh_mult[THR_ZEROMV] += 2000;
627  rd->thresh_mult[THR_ZEROG] += 2000;
628  rd->thresh_mult[THR_ZEROA] += 2000;
629  rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
630  rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
631
632  rd->thresh_mult[THR_H_PRED] += 2000;
633  rd->thresh_mult[THR_V_PRED] += 2000;
634  rd->thresh_mult[THR_D45_PRED] += 2500;
635  rd->thresh_mult[THR_D135_PRED] += 2500;
636  rd->thresh_mult[THR_D117_PRED] += 2500;
637  rd->thresh_mult[THR_D153_PRED] += 2500;
638  rd->thresh_mult[THR_D207_PRED] += 2500;
639  rd->thresh_mult[THR_D63_PRED] += 2500;
640}
641
642void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
643  static const int thresh_mult[2][MAX_REFS] = {
644    { 2500, 2500, 2500, 4500, 4500, 2500 },
645    { 2000, 2000, 2000, 4000, 4000, 2000 }
646  };
647  RD_OPT *const rd = &cpi->rd;
648  const int idx = cpi->oxcf.mode == BEST;
649  memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
650}
651
652void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
653                               int bsize, int best_mode_index) {
654  if (rd_thresh > 0) {
655    const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
656    int mode;
657    for (mode = 0; mode < top_mode; ++mode) {
658      const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
659      const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
660      BLOCK_SIZE bs;
661      for (bs = min_size; bs <= max_size; ++bs) {
662        int *const fact = &factor_buf[bs][mode];
663        if (mode == best_mode_index) {
664          *fact -= (*fact >> 4);
665        } else {
666          *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
667        }
668      }
669    }
670  }
671}
672
673int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
674                               int qindex, int qdelta) {
675  // Reduce the intra cost penalty for small blocks (<=16x16).
676  int reduction_fac =
677      (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
678
679  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
680    // Don't reduce intra cost penalty if estimated noise level is high.
681    reduction_fac = 0;
682
683  // Always use VPX_BITS_8 as input here because the penalty is applied
684  // to rate not distortion so we want a consistent penalty for all bit
685  // depths. If the actual bit depth were passed in here then the value
686  // retured by vp9_dc_quant() would scale with the bit depth and we would
687  // then need to apply inverse scaling to correct back to a bit depth
688  // independent rate penalty.
689  return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac;
690}
691