1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <limits.h>
13#include <math.h>
14#include <stdio.h>
15
16#include "./vp9_rtcd.h"
17
18#include "vpx_mem/vpx_mem.h"
19
20#include "vp9/common/vp9_common.h"
21#include "vp9/common/vp9_entropy.h"
22#include "vp9/common/vp9_entropymode.h"
23#include "vp9/common/vp9_idct.h"
24#include "vp9/common/vp9_mvref_common.h"
25#include "vp9/common/vp9_pragmas.h"
26#include "vp9/common/vp9_pred_common.h"
27#include "vp9/common/vp9_quant_common.h"
28#include "vp9/common/vp9_reconinter.h"
29#include "vp9/common/vp9_reconintra.h"
30#include "vp9/common/vp9_seg_common.h"
31#include "vp9/common/vp9_systemdependent.h"
32
33#include "vp9/encoder/vp9_cost.h"
34#include "vp9/encoder/vp9_encodemb.h"
35#include "vp9/encoder/vp9_encodemv.h"
36#include "vp9/encoder/vp9_mcomp.h"
37#include "vp9/encoder/vp9_onyx_int.h"
38#include "vp9/encoder/vp9_quantize.h"
39#include "vp9/encoder/vp9_ratectrl.h"
40#include "vp9/encoder/vp9_rdopt.h"
41#include "vp9/encoder/vp9_tokenize.h"
42#include "vp9/encoder/vp9_variance.h"
43
#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC      1
#define RD_THRESH_POW      1.25
// Divisor turning RDMULT into errorperbit (see vp9_initialize_rd_consts()).
#define RD_MULT_EPB_RATIO  64

/* Factor to weigh the rate for switchable interp filters */
#define SWITCHABLE_INTERP_RATE_FACTOR 1

// NOTE(review): presumably bit masks over vp9_mode_order entries used to
// prune modes per best-so-far reference frame — confirm against the
// mode-search loop that consumes them (not visible in this chunk).
#define LAST_FRAME_MODE_MASK    0xFFEDCD60
#define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
#define ALT_REF_MODE_MASK       0xFFC648D0

#define MIN_EARLY_TERM_INDEX    3
57
// A prediction mode paired with its (up to two) reference frames; the
// second reference is NONE for single-reference and intra modes.
typedef struct {
  MB_PREDICTION_MODE mode;
  MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;
62
// A reference-frame pair without an associated mode (see vp9_ref_order).
typedef struct {
  MV_REFERENCE_FRAME ref_frame[2];
} REF_DEFINITION;
66
// Scratch/accumulator state threaded through the per-transform-block RD
// callbacks (block_rd_txfm() via txfm_rd_in_plane()).
struct rdcost_block_args {
  MACROBLOCK *x;
  ENTROPY_CONTEXT t_above[16];  // entropy contexts above the current block
  ENTROPY_CONTEXT t_left[16];   // entropy contexts left of the current block
  int rate;                     // rate of the most recent block
  int64_t dist;                 // distortion of the most recent block
  int64_t sse;                  // sse of the most recent block
  int this_rate;                // running totals over the whole plane
  int64_t this_dist;
  int64_t this_sse;
  int64_t this_rd;
  int64_t best_rd;              // early-termination threshold for this_rd
  int skip;                     // set once this_rd exceeds best_rd
  int use_fast_coef_costing;    // use approximate contexts in cost_coeffs()
  const scan_order *so;         // scan/neighbor tables for the tx size
};
83
// Order in which (mode, reference-frame pair) candidates are evaluated by
// the RD mode search; statistically likely winners come first, which makes
// the early-termination thresholds more effective.
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  {NEARESTMV, {LAST_FRAME,   NONE}},
  {NEARESTMV, {ALTREF_FRAME, NONE}},
  {NEARESTMV, {GOLDEN_FRAME, NONE}},

  {DC_PRED,   {INTRA_FRAME,  NONE}},

  {NEWMV,     {LAST_FRAME,   NONE}},
  {NEWMV,     {ALTREF_FRAME, NONE}},
  {NEWMV,     {GOLDEN_FRAME, NONE}},

  {NEARMV,    {LAST_FRAME,   NONE}},
  {NEARMV,    {ALTREF_FRAME, NONE}},
  {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
  {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},

  {TM_PRED,   {INTRA_FRAME,  NONE}},

  {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
  {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
  {NEARMV,    {GOLDEN_FRAME, NONE}},
  {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
  {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},

  {ZEROMV,    {LAST_FRAME,   NONE}},
  {ZEROMV,    {GOLDEN_FRAME, NONE}},
  {ZEROMV,    {ALTREF_FRAME, NONE}},
  {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
  {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},

  {H_PRED,    {INTRA_FRAME,  NONE}},
  {V_PRED,    {INTRA_FRAME,  NONE}},
  {D135_PRED, {INTRA_FRAME,  NONE}},
  {D207_PRED, {INTRA_FRAME,  NONE}},
  {D153_PRED, {INTRA_FRAME,  NONE}},
  {D63_PRED,  {INTRA_FRAME,  NONE}},
  {D117_PRED, {INTRA_FRAME,  NONE}},
  {D45_PRED,  {INTRA_FRAME,  NONE}},
};
123
// Reference-frame combinations evaluated by the sub-8x8 RD search, in
// evaluation order.
const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
  {{LAST_FRAME,   NONE}},
  {{GOLDEN_FRAME, NONE}},
  {{ALTREF_FRAME, NONE}},
  {{LAST_FRAME,   ALTREF_FRAME}},
  {{GOLDEN_FRAME, ALTREF_FRAME}},
  {{INTRA_FRAME,  NONE}},
};
132
// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size.
// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
  {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
139
140static int raster_block_offset(BLOCK_SIZE plane_bsize,
141                               int raster_block, int stride) {
142  const int bw = b_width_log2(plane_bsize);
143  const int y = 4 * (raster_block >> bw);
144  const int x = 4 * (raster_block & ((1 << bw) - 1));
145  return y * stride + x;
146}
147static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
148                                          int raster_block, int16_t *base) {
149  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
150  return base + raster_block_offset(plane_bsize, raster_block, stride);
151}
152
// Precompute bit costs for intra-mode signalling and switchable
// interpolation filters from the current frame's probabilities.
static void fill_mode_costs(VP9_COMP *cpi) {
  MACROBLOCK *const x = &cpi->mb;
  const FRAME_CONTEXT *const fc = &cpi->common.fc;
  int i, j;

  // Key-frame y-mode costs are conditioned on the above/left modes.
  for (i = 0; i < INTRA_MODES; i++)
    for (j = 0; j < INTRA_MODES; j++)
      vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
                      vp9_intra_mode_tree);

  // TODO(rbultje) separate tables for superblock costing?
  vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
  vp9_cost_tokens(x->intra_uv_mode_cost[KEY_FRAME],
                  vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
  vp9_cost_tokens(x->intra_uv_mode_cost[INTER_FRAME],
                  fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree);

  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
    vp9_cost_tokens((int *)x->switchable_interp_costs[i],
                    fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
}
174
// Expand the model-based coefficient probabilities into full token-cost
// tables for every tx size, plane type, reference type, band and context.
// Index [0] before the context holds regular costs (vp9_cost_tokens);
// index [1] holds costs without the EOB branch (vp9_cost_tokens_skip).
static void fill_token_costs(vp9_coeff_cost *c,
                             vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
  int i, j, k, l;
  TX_SIZE t;
  for (t = TX_4X4; t <= TX_32X32; ++t)
    for (i = 0; i < PLANE_TYPES; ++i)
      for (j = 0; j < REF_TYPES; ++j)
        for (k = 0; k < COEF_BANDS; ++k)
          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
            vp9_prob probs[ENTROPY_NODES];
            vp9_model_to_full_probs(p[t][i][j][k][l], probs);
            vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                            vp9_coef_tree);
            vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                 vp9_coef_tree);
            // Both variants must agree on the EOB cost by construction.
            assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
                   c[t][i][j][k][1][l][EOB_TOKEN]);
          }
}
194
// Extra rdmult weighting as a function of the first-pass intra/inter ratio
// (next_iiratio, see vp9_compute_rd_mult()); only indices 0-4 are non-zero.
static const int rd_iifactor[32] = {
  4, 4, 3, 2, 1, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
};

// 3* dc_qlookup[Q]*dc_qlookup[Q];

/* values are now correlated to quantizer */
static int sad_per_bit16lut[QINDEX_RANGE];
static int sad_per_bit4lut[QINDEX_RANGE];
207
208void vp9_init_me_luts() {
209  int i;
210
211  // Initialize the sad lut tables using a formulaic calculation for now
212  // This is to make it easier to resolve the impact of experimental changes
213  // to the quantizer tables.
214  for (i = 0; i < QINDEX_RANGE; i++) {
215    const double q = vp9_convert_qindex_to_q(i);
216    sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
217    sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
218  }
219}
220
221int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
222  const int q = vp9_dc_quant(qindex, 0);
223  // TODO(debargha): Adjust the function below
224  int rdmult = 88 * q * q / 25;
225  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
226    if (cpi->twopass.next_iiratio > 31)
227      rdmult += (rdmult * rd_iifactor[31]) >> 4;
228    else
229      rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
230  }
231  return rdmult;
232}
233
234static int compute_rd_thresh_factor(int qindex) {
235  // TODO(debargha): Adjust the function below
236  const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
237  return MAX(q, 8);
238}
239
// Load the motion-estimation SAD-per-bit constants for the given qindex
// from the tables built by vp9_init_me_luts().
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
  cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
  cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
}
244
// Fill the per-segment, per-block-size early-termination thresholds
// (rd_threshes for full-size modes, rd_thresh_sub8x8 for sub-8x8 refs),
// scaled by the segment quantizer and block size.
static void set_block_thresholds(VP9_COMP *cpi) {
  const VP9_COMMON *const cm = &cpi->common;
  int i, bsize, segment_id;

  for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
    const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
                                            cm->base_qindex) + cm->y_dc_delta_q,
                             0, MAXQ);
    const int q = compute_rd_thresh_factor(qindex);

    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
      // Threshold here seems unnecessarily harsh but fine given actual
      // range of values used for cpi->sf.thresh_mult[].
      // t is always positive: q >= 8 (compute_rd_thresh_factor) and the
      // block-size factors are >= 2, so INT_MAX / t cannot divide by zero.
      const int t = q * rd_thresh_block_size_factor[bsize];
      const int thresh_max = INT_MAX / t;

      for (i = 0; i < MAX_MODES; ++i)
        cpi->rd_threshes[segment_id][bsize][i] =
            cpi->rd_thresh_mult[i] < thresh_max ? cpi->rd_thresh_mult[i] * t / 4
                                            : INT_MAX;

      for (i = 0; i < MAX_REFS; ++i) {
        cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
            cpi->rd_thresh_mult_sub8x8[i] < thresh_max
                ? cpi->rd_thresh_mult_sub8x8[i] * t / 4
                : INT_MAX;
      }
    }
  }
}
275
// (Re)initialize all rate-distortion constants and cost tables that depend
// on the current frame's quantizer and coding state.
void vp9_initialize_rd_consts(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = &cpi->mb;
  int i;

  vp9_clear_system_state();

  cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
  cpi->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);

  x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
  x->errorperbit += (x->errorperbit == 0);  // keep errorperbit >= 1

  x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
                         cm->frame_type != KEY_FRAME) ? 0 : 1;

  set_block_thresholds(cpi);

  // Coefficient and partition costs are needed only for full RD mode
  // selection (and always on key frames).
  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
    fill_token_costs(x->token_costs, cm->fc.coef_probs);

    for (i = 0; i < PARTITION_CONTEXTS; i++)
      vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
                      vp9_partition_tree);
  }

  // In non-RD mode the mode/MV costs are refreshed only every 8th frame to
  // limit overhead; they are always rebuilt on key frames.
  if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
      cm->frame_type == KEY_FRAME) {
    fill_mode_costs(cpi);

    if (!frame_is_intra_only(cm)) {
      vp9_build_nmv_cost_table(x->nmvjointcost,
                               cm->allow_high_precision_mv ? x->nmvcost_hp
                                                           : x->nmvcost,
                               &cm->fc.nmvc, cm->allow_high_precision_mv);

      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
        vp9_cost_tokens((int *)x->inter_mode_cost[i],
                        cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
    }
  }
}
318
// Largest supported xsq_q10 value; equals xsq_iq_q10[last entry] - 1
// (see the commented-out asserts in model_rd_norm()).
static const int MAX_XSQ_Q10 = 245727;

// Look up the normalized rate and distortion (both Q10) for a Laplacian
// source given xsq_q10 = qstep^2 * n / variance in Q10 fixed point, using
// piecewise-linear interpolation of the tables below.
static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
  // NOTE: The tables below must be of the same size

  // The functions described below are sampled at the four most significant
  // bits of x^2 + 8 / 256

  // Normalized rate
  // This table models the rate for a Laplacian source
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expression is:
  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
  // and H(x) is the binary entropy function.
  static const int rate_tab_q10[] = {
    65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
     4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
     3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
     3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
     2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
     2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
     1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
     1159,  1086,  1021,   963,   911,   864,   821,   781,
      745,   680,   623,   574,   530,   490,   455,   424,
      395,   345,   304,   269,   239,   213,   190,   171,
      154,   126,   104,    87,    73,    61,    52,    44,
       38,    28,    21,    16,    12,    10,     8,     6,
        5,     3,     2,     1,     1,     1,     0,     0,
  };
  // Normalized distortion
  // This table models the normalized distortion for a Laplacian source
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expression is:
  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
  // where x = qpstep / sqrt(variance)
  // Note the actual distortion is Dn * variance.
  static const int dist_tab_q10[] = {
       0,     0,     1,     1,     1,     2,     2,     2,
       3,     3,     4,     5,     5,     6,     7,     7,
       8,     9,    11,    12,    13,    15,    16,    17,
      18,    21,    24,    26,    29,    31,    34,    36,
      39,    44,    49,    54,    59,    64,    69,    73,
      78,    88,    97,   106,   115,   124,   133,   142,
     151,   167,   184,   200,   215,   231,   245,   260,
     274,   301,   327,   351,   375,   397,   418,   439,
     458,   495,   528,   559,   587,   613,   637,   659,
     680,   717,   749,   777,   801,   823,   842,   859,
     874,   899,   919,   936,   949,   960,   969,   977,
     983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
    1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
  };
  // Sample points of the two tables above: xsq values (Q10), spaced so
  // that each octave contains 8 entries.
  static const int xsq_iq_q10[] = {
         0,      4,      8,     12,     16,     20,     24,     28,
        32,     40,     48,     56,     64,     72,     80,     88,
        96,    112,    128,    144,    160,    176,    192,    208,
       224,    256,    288,    320,    352,    384,    416,    448,
       480,    544,    608,    672,    736,    800,    864,    928,
       992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
      2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
      4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
      8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
     16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
     32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
     65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
    131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
  };
  /*
  static const int tab_size = sizeof(rate_tab_q10) / sizeof(rate_tab_q10[0]);
  assert(sizeof(dist_tab_q10) / sizeof(dist_tab_q10[0]) == tab_size);
  assert(sizeof(xsq_iq_q10) / sizeof(xsq_iq_q10[0]) == tab_size);
  assert(MAX_XSQ_Q10 + 1 == xsq_iq_q10[tab_size - 1]);
  */
  int tmp = (xsq_q10 >> 2) + 8;
  int k = get_msb(tmp) - 3;                 // octave (segment scale) index
  int xq = (k << 3) + ((tmp >> k) & 0x7);   // table index: 8 entries/octave
  const int one_q10 = 1 << 10;
  // a_q10/b_q10: Q10 interpolation weights between samples xq and xq + 1.
  const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
  const int b_q10 = one_q10 - a_q10;
  *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
401
// Model the rate and distortion of coding n pixels with variance 'var'
// using quantizer step 'qstep', via the normalized Laplacian tables in
// model_rd_norm(). *rate is the total (per-pixel Q10 rate * n, rounded,
// >> 2); *dist is var scaled by the normalized Q10 distortion.
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
                                  unsigned int qstep, int *rate,
                                  int64_t *dist) {
  // This function models the rate and distortion for a Laplacian
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expressions are in:
  // Hang and Chen, "Source Model for transform video coder and its
  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
  // Sys. for Video Tech., April 1997.
  if (var == 0) {
    *rate = 0;
    *dist = 0;
  } else {
    int d_q10, r_q10;
    // xsq_q10 = qstep^2 * n / var in Q10 with rounding, computed in 64 bits
    // to avoid overflow, then clamped to the table range.
    const uint64_t xsq_q10_64 =
        ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
    const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ?
                        MAX_XSQ_Q10 : (int)xsq_q10_64;
    model_rd_norm(xsq_q10, &r_q10, &d_q10);
    *rate = (n * r_q10 + 2) >> 2;
    *dist = (var * (int64_t)d_q10 + 512) >> 10;
  }
}
425
// Estimate the rate and distortion of a superblock across all planes from
// the prediction error (source vs pd->dst), without transform/quantize.
// At speed > 4 a cheap linear approximation replaces the Laplacian model.
static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
                            MACROBLOCK *x, MACROBLOCKD *xd,
                            int *out_rate_sum, int64_t *out_dist_sum) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  int i;
  int64_t rate_sum = 0;
  int64_t dist_sum = 0;
  const int ref = xd->mi[0]->mbmi.ref_frame[0];
  unsigned int sse;

  for (i = 0; i < MAX_MB_PLANE; ++i) {
    struct macroblock_plane *const p = &x->plane[i];
    struct macroblockd_plane *const pd = &xd->plane[i];
    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);

    (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
                              pd->dst.buf, pd->dst.stride, &sse);

    // Cache the luma prediction sse for this reference frame.
    if (i == 0)
      x->pred_sse[ref] = sse;

    // Fast approximate the modelling function.
    if (cpi->speed > 4) {
      int64_t rate;
      int64_t dist;
      int64_t square_error = sse;
      int quantizer = (pd->dequant[1] >> 3);

      if (quantizer < 120)
        rate = (square_error * (280 - quantizer)) >> 8;
      else
        rate = 0;
      dist = (square_error * quantizer) >> 8;
      rate_sum += rate;
      dist_sum += dist;
    } else {
      int rate;
      int64_t dist;
      vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
                                   pd->dequant[1] >> 3, &rate, &dist);
      rate_sum += rate;
      dist_sum += dist;
    }
  }

  *out_rate_sum = (int)rate_sum;
  // NOTE(review): << 4 presumably matches the fixed-point scale of the
  // transform-domain distortion used elsewhere — confirm against callers.
  *out_dist_sum = dist_sum << 4;
}
476
477static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
478                                 TX_SIZE tx_size,
479                                 MACROBLOCK *x, MACROBLOCKD *xd,
480                                 int *out_rate_sum, int64_t *out_dist_sum,
481                                 int *out_skip) {
482  int j, k;
483  BLOCK_SIZE bs;
484  const struct macroblock_plane *const p = &x->plane[0];
485  const struct macroblockd_plane *const pd = &xd->plane[0];
486  const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
487  const int height = 4 * num_4x4_blocks_high_lookup[bsize];
488  int rate_sum = 0;
489  int64_t dist_sum = 0;
490  const int t = 4 << tx_size;
491
492  if (tx_size == TX_4X4) {
493    bs = BLOCK_4X4;
494  } else if (tx_size == TX_8X8) {
495    bs = BLOCK_8X8;
496  } else if (tx_size == TX_16X16) {
497    bs = BLOCK_16X16;
498  } else if (tx_size == TX_32X32) {
499    bs = BLOCK_32X32;
500  } else {
501    assert(0);
502  }
503
504  *out_skip = 1;
505  for (j = 0; j < height; j += t) {
506    for (k = 0; k < width; k += t) {
507      int rate;
508      int64_t dist;
509      unsigned int sse;
510      cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
511                         &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
512                         &sse);
513      // sse works better than var, since there is no dc prediction used
514      vp9_model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
515                                   &rate, &dist);
516      rate_sum += rate;
517      dist_sum += dist;
518      *out_skip &= (rate < 1024);
519    }
520  }
521
522  *out_rate_sum = rate_sum;
523  *out_dist_sum = dist_sum << 4;
524}
525
/* Sum of squared differences between the original and dequantized
 * coefficients (transform-domain distortion). Also returns the sum of
 * squared source coefficients via *ssz (the distortion if all
 * coefficients were zeroed). Returns the squared error. */
int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz) {
  int i;
  int64_t error = 0, sqcoeff = 0;

  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    // Widen before multiplying: with 16-bit inputs |diff| can reach 65535,
    // whose square overflows a 32-bit int (signed overflow is undefined
    // behavior).
    error += (int64_t)diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }

  *ssz = sqcoeff;
  return error;
}
540
/* The trailing '0' is a terminator which is used inside cost_coeffs() to
 * decide whether to include cost of a trailing EOB node or not (i.e. we
 * can skip this if the last coefficient in this transform block, e.g. the
 * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
 * were non-zero). */
// Per-tx-size coefficient band sizes. cost_coeffs() starts reading at
// entry [1] because band 0 (the DC coefficient) is costed separately; the
// large entry covers all coefficients after the first 21 (13 for 4x4).
static const int16_t band_counts[TX_SIZES][8] = {
  { 1, 2, 3, 4,  3,   16 - 13, 0 },
  { 1, 2, 3, 4, 11,   64 - 21, 0 },
  { 1, 2, 3, 4, 11,  256 - 21, 0 },
  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
// Compute the token cost (rate) of one transform block's quantized
// coefficients, walking them in scan order through the per-band,
// per-context cost tables. A and L are the above/left entropy contexts;
// both are set on exit to whether the block had any non-zero coefficients.
static INLINE int cost_coeffs(MACROBLOCK *x,
                              int plane, int block,
                              ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                              TX_SIZE tx_size,
                              const int16_t *scan, const int16_t *nb,
                              int use_fast_coef_costing) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  const struct macroblock_plane *p = &x->plane[plane];
  const struct macroblockd_plane *pd = &xd->plane[plane];
  const PLANE_TYPE type = pd->plane_type;
  // Skip band 0: the DC coefficient is costed separately below.
  const int16_t *band_count = &band_counts[tx_size][1];
  const int eob = p->eobs[block];
  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
  uint8_t token_cache[32 * 32];
  int pt = combine_entropy_contexts(*A, *L);
  int c, cost;
  // Check for consistency of tx_size with mode info
  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                              : get_uv_tx_size(mbmi) == tx_size);

  if (eob == 0) {
    // single eob token
    cost = token_costs[0][0][pt][EOB_TOKEN];
    c = 0;
  } else {
    int band_left = *band_count++;

    // dc token
    int v = qcoeff[0];
    int prev_t = vp9_dct_value_tokens_ptr[v].token;
    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
    token_cache[0] = vp9_pt_energy_class[prev_t];
    ++token_costs;

    // ac tokens
    for (c = 1; c < eob; c++) {
      const int rc = scan[c];
      int t;

      v = qcoeff[rc];
      t = vp9_dct_value_tokens_ptr[v].token;
      if (use_fast_coef_costing) {
        // Fast mode: approximate the coefficient context with !prev_t
        // instead of deriving it from the neighbor token cache.
        cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
      } else {
        pt = get_coef_context(nb, token_cache, c);
        cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
        token_cache[rc] = vp9_pt_energy_class[t];
      }
      prev_t = t;
      if (!--band_left) {
        // Move to the next coefficient band's cost table.
        band_left = *band_count++;
        ++token_costs;
      }
    }

    // eob token
    // band_left == 0 means the band terminator was reached (block ended on
    // its final coefficient), so no explicit EOB token is coded.
    if (band_left) {
      if (use_fast_coef_costing) {
        cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
      } else {
        pt = get_coef_context(nb, token_cache, c);
        cost += (*token_costs)[0][pt][EOB_TOKEN];
      }
    }
  }

  // is eob first coefficient;
  *A = *L = (c > 0);

  return cost;
}
626static void dist_block(int plane, int block, TX_SIZE tx_size,
627                       struct rdcost_block_args* args) {
628  const int ss_txfrm_size = tx_size << 1;
629  MACROBLOCK* const x = args->x;
630  MACROBLOCKD* const xd = &x->e_mbd;
631  const struct macroblock_plane *const p = &x->plane[plane];
632  const struct macroblockd_plane *const pd = &xd->plane[plane];
633  int64_t this_sse;
634  int shift = tx_size == TX_32X32 ? 0 : 2;
635  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
636  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
637  args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
638                               &this_sse) >> shift;
639  args->sse  = this_sse >> shift;
640
641  if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
642    // TODO(jingning): tune the model to better capture the distortion.
643    int64_t p = (pd->dequant[1] * pd->dequant[1] *
644                    (1 << ss_txfrm_size)) >> (shift + 2);
645    args->dist += (p >> 4);
646    args->sse  += p;
647  }
648}
649
650static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
651                       TX_SIZE tx_size, struct rdcost_block_args* args) {
652  int x_idx, y_idx;
653  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
654
655  args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
656                           args->t_left + y_idx, tx_size,
657                           args->so->scan, args->so->neighbors,
658                           args->use_fast_coef_costing);
659}
660
// Per-transform-block callback for txfm_rd_in_plane(): encode (intra) or
// transform/quantize (inter) the block, accumulate its rate/distortion/sse
// into args, and flag early termination once the running rd cost exceeds
// best_rd.
static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
                          TX_SIZE tx_size, void *arg) {
  struct rdcost_block_args *args = arg;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  int64_t rd1, rd2, rd;

  if (args->skip)
    return;

  if (!is_inter_block(mbmi))
    vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
  else
    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);

  dist_block(plane, block, tx_size, args);
  rate_block(plane, block, plane_bsize, tx_size, args);
  // rd1: code the coefficients; rd2: skip them and pay the full sse.
  rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
  rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);

  // TODO(jingning): temporarily enabled only for luma component
  rd = MIN(rd1, rd2);
  // Mark the block zeroable if it has no non-zero coefficients or if
  // skipping is cheaper (never in lossless mode).
  if (plane == 0)
    x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
                                    (rd1 > rd2 && !xd->lossless);

  args->this_rate += args->rate;
  args->this_dist += args->dist;
  args->this_sse  += args->sse;
  args->this_rd += rd;

  if (args->this_rd > args->best_rd) {
    args->skip = 1;
    return;
  }
}
698
// Gather the above/left entropy contexts for one plane at the given
// transform size. For tx sizes above 4x4, each transform block spans
// several 4x4 context entries; each group is collapsed to a zero/non-zero
// flag by reading the group as one wider integer.
// NOTE(review): the uint16/32/64 loads type-pun the ENTROPY_CONTEXT
// arrays; this relies on the context buffers being suitably sized and
// aligned — confirm at the buffer definitions.
void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                              const struct macroblockd_plane *pd,
                              ENTROPY_CONTEXT t_above[16],
                              ENTROPY_CONTEXT t_left[16]) {
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
  const ENTROPY_CONTEXT *const above = pd->above_context;
  const ENTROPY_CONTEXT *const left = pd->left_context;

  int i;
  switch (tx_size) {
    case TX_4X4:
      vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
      vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
      break;
    case TX_8X8:
      for (i = 0; i < num_4x4_w; i += 2)
        t_above[i] = !!*(const uint16_t *)&above[i];
      for (i = 0; i < num_4x4_h; i += 2)
        t_left[i] = !!*(const uint16_t *)&left[i];
      break;
    case TX_16X16:
      for (i = 0; i < num_4x4_w; i += 4)
        t_above[i] = !!*(const uint32_t *)&above[i];
      for (i = 0; i < num_4x4_h; i += 4)
        t_left[i] = !!*(const uint32_t *)&left[i];
      break;
    case TX_32X32:
      for (i = 0; i < num_4x4_w; i += 8)
        t_above[i] = !!*(const uint64_t *)&above[i];
      for (i = 0; i < num_4x4_h; i += 8)
        t_left[i] = !!*(const uint64_t *)&left[i];
      break;
    default:
      assert(0 && "Invalid transform size.");
  }
}
737
// Evaluate rate, distortion, sse and skippability of one plane at a given
// transform size by running block_rd_txfm() over every transform block.
// If the walk terminated early (args.skip), the outputs are poisoned with
// INT_MAX / INT64_MAX so the caller rejects this candidate.
static void txfm_rd_in_plane(MACROBLOCK *x,
                             int *rate, int64_t *distortion,
                             int *skippable, int64_t *sse,
                             int64_t ref_best_rd, int plane,
                             BLOCK_SIZE bsize, TX_SIZE tx_size,
                             int use_fast_coef_casting) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  struct rdcost_block_args args = { 0 };
  args.x = x;
  args.best_rd = ref_best_rd;
  args.use_fast_coef_costing = use_fast_coef_casting;

  // The luma tx size lives in the mode info and is consumed downstream.
  if (plane == 0)
    xd->mi[0]->mbmi.tx_size = tx_size;

  vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);

  args.so = get_scan(xd, tx_size, pd->plane_type, 0);

  vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
                                         block_rd_txfm, &args);
  if (args.skip) {
    *rate       = INT_MAX;
    *distortion = INT64_MAX;
    *sse        = INT64_MAX;
    *skippable  = 0;
  } else {
    *distortion = args.this_dist;
    *rate       = args.this_rate;
    *sse        = args.this_sse;
    *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
  }
}
772
// Pick the largest transform size allowed by both the block size and the
// frame's tx_mode, and evaluate only that size (no per-size search).
static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
                                     int *rate, int64_t *distortion,
                                     int *skip, int64_t *sse,
                                     int64_t ref_best_rd,
                                     BLOCK_SIZE bs) {
  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
  VP9_COMMON *const cm = &cpi->common;
  const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;

  mbmi->tx_size = MIN(max_tx_size, largest_tx_size);

  txfm_rd_in_plane(x, rate, distortion, skip,
                   &sse[mbmi->tx_size], ref_best_rd, 0, bs,
                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
  cpi->tx_stepdown_count[0]++;
}
791
// Given per-tx-size rates r[n][0], distortions d[n] and skip flags s[n],
// pick the transform size with the best RD score — including the cost of
// signalling the size when tx_mode is TX_MODE_SELECT — and fill tx_cache
// with the best score achievable under each TX_MODE.
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                     int (*r)[2], int *rate,
                                     int64_t *d, int64_t *distortion,
                                     int *s, int *skip,
                                     int64_t tx_cache[TX_MODES],
                                     BLOCK_SIZE bs) {
  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
  // rd[n][0]: score without tx-size signalling; rd[n][1]: with it.
  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX}};
  int n, m;
  int s0, s1;
  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
  int64_t best_rd = INT64_MAX;
  TX_SIZE best_tx = TX_4X4;

  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
  assert(skip_prob > 0);
  // Bit costs of coding the skip flag as 0 / 1.
  s0 = vp9_cost_bit(skip_prob, 0);
  s1 = vp9_cost_bit(skip_prob, 1);

  for (n = TX_4X4; n <= max_tx_size; n++) {
    r[n][1] = r[n][0];
    if (r[n][0] < INT_MAX) {
      // Add the unary-code cost of signalling tx size n.
      for (m = 0; m <= n - (n == max_tx_size); m++) {
        if (m == n)
          r[n][1] += vp9_cost_zero(tx_probs[m]);
        else
          r[n][1] += vp9_cost_one(tx_probs[m]);
      }
    }
    if (d[n] == INT64_MAX) {
      rd[n][0] = rd[n][1] = INT64_MAX;
    } else if (s[n]) {
      // Skippable: the rate collapses to just the skip flag.
      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
    } else {
      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
    }

    if (rd[n][1] < best_rd) {
      best_tx = n;
      best_rd = rd[n][1];
    }
  }
  // Outside TX_MODE_SELECT the size is fixed by the tx mode (capped by the
  // block size).
  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
                      best_tx : MIN(max_tx_size, max_mode_tx_size);


  *distortion = d[mbmi->tx_size];
  *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
  *skip       = s[mbmi->tx_size];

  // Cache the best achievable rd under each fixed tx mode, and under
  // TX_MODE_SELECT (also counting how far the choice stepped down).
  tx_cache[ONLY_4X4] = rd[TX_4X4][0];
  tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
  tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
  tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];

  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
    cpi->tx_stepdown_count[0]++;
  } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
    cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
  } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
    tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
    cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
  } else {
    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
    cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
  }
}
869
870static int64_t scaled_rd_cost(int rdmult, int rddiv,
871                              int rate, int64_t dist, double scale) {
872  return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale);
873}
874
// Model-based variant of choose_txfm_size_from_rd: picks the transform
// size from modeled rate/distortion estimates weighted by a per-size
// scale factor, then runs the real transform search once for the winner.
static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
                                          int (*r)[2], int *rate,
                                          int64_t *d, int64_t *distortion,
                                          int *s, int *skip, int64_t *sse,
                                          int64_t ref_best_rd,
                                          BLOCK_SIZE bs) {
  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
  // rd[n][0]: cost without tx-size signaling; rd[n][1]: with signaling.
  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX}};
  int n, m;
  int s0, s1;
  // Per-tx-size weights biasing the selection towards larger transforms.
  double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
  int64_t best_rd = INT64_MAX;
  TX_SIZE best_tx = TX_4X4;

  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
  assert(skip_prob > 0);
  // Bit costs of coding the skip flag as 0 (block coded) or 1 (skipped).
  s0 = vp9_cost_bit(skip_prob, 0);
  s1 = vp9_cost_bit(skip_prob, 1);

  for (n = TX_4X4; n <= max_tx_size; n++) {
    double scale = scale_rd[n];
    r[n][1] = r[n][0];
    // Add the cost of signaling tx size n (see choose_txfm_size_from_rd).
    for (m = 0; m <= n - (n == max_tx_size); m++) {
      if (m == n)
        r[n][1] += vp9_cost_zero(tx_probs[m]);
      else
        r[n][1] += vp9_cost_one(tx_probs[m]);
    }
    if (s[n]) {
      rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n],
                                           scale);
    } else {
      rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n],
                                scale);
      rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n],
                                scale);
    }
    if (rd[n][1] < best_rd) {
      best_rd = rd[n][1];
      best_tx = n;
    }
  }

  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
                      best_tx : MIN(max_tx_size, max_mode_tx_size);

  // Actually encode using the chosen mode if a model was used, but do not
  // update the r, d costs
  txfm_rd_in_plane(x, rate, distortion, skip,
                   &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size,
                   cpi->sf.use_fast_coef_costing);

  // Step-down statistics, mirroring choose_txfm_size_from_rd.
  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
    cpi->tx_stepdown_count[0]++;
  } else if (max_tx_size >= TX_16X16 &&  best_tx == TX_16X16) {
    cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
  // NOTE(review): uses <= here while choose_txfm_size_from_rd uses < for
  // the same comparison -- presumably an intentional tie-break difference
  // in the model path; confirm against upstream history.
  } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
    cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
  } else {
    cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
  }
}
945
// Computes the luma rate/distortion for an inter-coded block.  The
// transform size is selected according to the tx_size_search_method
// speed feature: largest-only, model-based, or full rd search.
static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                  int64_t *distortion, int *skip,
                                  int64_t *psse, BLOCK_SIZE bs,
                                  int64_t txfm_cache[TX_MODES],
                                  int64_t ref_best_rd) {
  int r[TX_SIZES][2], s[TX_SIZES];
  int64_t d[TX_SIZES], sse[TX_SIZES];
  MACROBLOCKD *xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
  TX_SIZE tx_size;

  assert(bs == mbmi->sb_type);

  // Compute the luma residual before any transform search.
  vp9_subtract_plane(x, bs, 0);

  if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
    // Fast path: always use the largest transform size allowed.
    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
    choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                             ref_best_rd, bs);
    if (psse)
      *psse = sse[mbmi->tx_size];
    return;
  }

  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
    // Estimate per-size rd with a model rather than actual encoding.
    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
      model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
                           &r[tx_size][0], &d[tx_size], &s[tx_size]);
    choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
                                  skip, sse, ref_best_rd, bs);
  } else {
    // Full search: measure actual rd for every transform size.
    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
                       &s[tx_size], &sse[tx_size],
                       ref_best_rd, 0, bs, tx_size,
                       cpi->sf.use_fast_coef_costing);
    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                             skip, txfm_cache, bs);
  }
  if (psse)
    *psse = sse[mbmi->tx_size];
}
989
990static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
991                                  int64_t *distortion, int *skip,
992                                  int64_t *psse, BLOCK_SIZE bs,
993                                  int64_t txfm_cache[TX_MODES],
994                                  int64_t ref_best_rd) {
995  int64_t sse[TX_SIZES];
996  MACROBLOCKD *xd = &x->e_mbd;
997  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
998
999  assert(bs == mbmi->sb_type);
1000  if (cpi->sf.tx_size_search_method != USE_FULL_RD) {
1001    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
1002    choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
1003                             ref_best_rd, bs);
1004  } else {
1005    int r[TX_SIZES][2], s[TX_SIZES];
1006    int64_t d[TX_SIZES];
1007    TX_SIZE tx_size;
1008    for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
1009      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
1010                       &s[tx_size], &sse[tx_size],
1011                       ref_best_rd, 0, bs, tx_size,
1012                       cpi->sf.use_fast_coef_costing);
1013    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
1014                             skip, txfm_cache, bs);
1015  }
1016  if (psse)
1017    *psse = sse[mbmi->tx_size];
1018}
1019
1020
1021static int conditional_skipintra(MB_PREDICTION_MODE mode,
1022                                 MB_PREDICTION_MODE best_intra_mode) {
1023  if (mode == D117_PRED &&
1024      best_intra_mode != V_PRED &&
1025      best_intra_mode != D135_PRED)
1026    return 1;
1027  if (mode == D63_PRED &&
1028      best_intra_mode != V_PRED &&
1029      best_intra_mode != D45_PRED)
1030    return 1;
1031  if (mode == D207_PRED &&
1032      best_intra_mode != H_PRED &&
1033      best_intra_mode != D45_PRED)
1034    return 1;
1035  if (mode == D153_PRED &&
1036      best_intra_mode != H_PRED &&
1037      best_intra_mode != D135_PRED)
1038    return 1;
1039  return 0;
1040}
1041
// Searches all 4x4 intra prediction modes for one sub-8x8 block (index
// ib within the 8x8), returning the best rd cost found (or rd_thresh if
// nothing beat it).  On success the best mode, its rates/distortion, and
// the updated entropy contexts are written through the out parameters,
// and the best reconstruction is copied back into the destination buffer.
static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                     MB_PREDICTION_MODE *best_mode,
                                     const int *bmode_costs,
                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                     int *bestrate, int *bestratey,
                                     int64_t *bestdistortion,
                                     BLOCK_SIZE bsize, int64_t rd_thresh) {
  MB_PREDICTION_MODE mode;
  MACROBLOCKD *const xd = &x->e_mbd;
  int64_t best_rd = rd_thresh;

  struct macroblock_plane *p = &x->plane[0];
  struct macroblockd_plane *pd = &xd->plane[0];
  const int src_stride = p->src.stride;
  const int dst_stride = pd->dst.stride;
  const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
                                                            src_stride)];
  uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
                                                       dst_stride)];
  // ta/tl: working copies of the incoming entropy contexts; tempa/templ
  // hold the per-mode state so a losing mode does not pollute ta/tl.
  ENTROPY_CONTEXT ta[2], tempa[2];
  ENTROPY_CONTEXT tl[2], templ[2];

  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
  int idx, idy;
  // Holds the reconstructed pixels of the best mode so far, so they can
  // be restored after later (worse) modes overwrite the dst buffer.
  uint8_t best_dst[8 * 8];

  assert(ib < 4);

  vpx_memcpy(ta, a, sizeof(ta));
  vpx_memcpy(tl, l, sizeof(tl));
  xd->mi[0]->mbmi.tx_size = TX_4X4;

  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
    int64_t this_rd;
    int ratey = 0;
    int64_t distortion = 0;
    int rate = bmode_costs[mode];

    // Skip modes pruned out by the per-tx-size intra mode mask.
    if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
      continue;

    // Only do the oblique modes if the best so far is
    // one of the neighboring directional modes
    if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
      if (conditional_skipintra(mode, *best_mode))
          continue;
    }

    vpx_memcpy(tempa, ta, sizeof(ta));
    vpx_memcpy(templ, tl, sizeof(tl));

    for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
      for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
        const int block = ib + idy * 2 + idx;
        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
                                                            p->src_diff);
        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
        xd->mi[0]->bmi[block].as_mode = mode;
        vp9_predict_intra_block(xd, block, 1,
                                TX_4X4, mode,
                                x->skip_encode ? src : dst,
                                x->skip_encode ? src_stride : dst_stride,
                                dst, dst_stride, idx, idy, 0);
        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);

        if (xd->lossless) {
          // Lossless path: Walsh-Hadamard transform, no distortion term
          // (reconstruction is exact).
          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
          vp9_fwht4x4(src_diff, coeff, 8);
          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                               so->scan, so->neighbors,
                               cpi->sf.use_fast_coef_costing);
          // Early out as soon as the partial cost already exceeds the
          // best rd; the reconstruction below then never runs.
          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
            goto next;
          vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
                          p->eobs[block]);
        } else {
          int64_t unused;
          const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
          vp9_fht4x4(src_diff, coeff, 8, tx_type);
          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                             so->scan, so->neighbors,
                             cpi->sf.use_fast_coef_costing);
          // >> 2 compensates for the forward transform scaling.
          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                        16, &unused) >> 2;
          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
            goto next;
          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
                         dst, dst_stride, p->eobs[block]);
        }
      }
    }

    rate += ratey;
    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

    if (this_rd < best_rd) {
      *bestrate = rate;
      *bestratey = ratey;
      *bestdistortion = distortion;
      best_rd = this_rd;
      *best_mode = mode;
      // Commit the winning entropy contexts and snapshot the winning
      // reconstruction so it can be restored at the end.
      vpx_memcpy(a, tempa, sizeof(tempa));
      vpx_memcpy(l, templ, sizeof(templ));
      for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
        vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                   num_4x4_blocks_wide * 4);
    }
  next:
    {}
  }

  // Nothing beat the threshold (or reconstruction is skipped): leave the
  // dst buffer as-is.
  if (best_rd >= rd_thresh || x->skip_encode)
    return best_rd;

  // Restore the best mode's reconstruction into the destination buffer.
  for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
    vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
               num_4x4_blocks_wide * 4);

  return best_rd;
}
1168
// Picks the best intra prediction mode for each sub-8x8 partition (4x4,
// 4x8 or 8x4) of an 8x8 block, accumulating rate and distortion over the
// partitions.  Returns the total rd cost, or INT64_MAX if the running
// cost exceeds best_rd at any point.
static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
                                            int *rate, int *rate_y,
                                            int64_t *distortion,
                                            int64_t best_rd) {
  int i, j;
  const MACROBLOCKD *const xd = &mb->e_mbd;
  MODE_INFO *const mic = xd->mi[0];
  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
  int idx, idy;
  int cost = 0;
  int64_t total_distortion = 0;
  int tot_rate_y = 0;
  int64_t total_rd = 0;
  ENTROPY_CONTEXT t_above[4], t_left[4];
  const int *bmode_costs = mb->mbmode_cost;

  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));

  // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
      MB_PREDICTION_MODE best_mode = DC_PRED;
      int r = INT_MAX, ry = INT_MAX;
      int64_t d = INT64_MAX, this_rd = INT64_MAX;
      i = idy * 2 + idx;
      if (cpi->common.frame_type == KEY_FRAME) {
        // On key frames the mode cost is conditioned on the above/left
        // neighboring block modes.
        const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
        const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);

        bmode_costs  = mb->y_mode_costs[A][L];
      }

      // Remaining rd budget for this partition is best_rd - total_rd.
      this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
                                      t_above + idx, t_left + idy, &r, &ry, &d,
                                      bsize, best_rd - total_rd);
      if (this_rd >= best_rd - total_rd)
        return INT64_MAX;

      total_rd += this_rd;
      cost += r;
      total_distortion += d;
      tot_rate_y += ry;

      // Replicate the chosen mode into the other 4x4 positions covered
      // by this (possibly 4x8/8x4) partition.
      mic->bmi[i].as_mode = best_mode;
      for (j = 1; j < num_4x4_blocks_high; ++j)
        mic->bmi[i + j * 2].as_mode = best_mode;
      for (j = 1; j < num_4x4_blocks_wide; ++j)
        mic->bmi[i + j].as_mode = best_mode;

      if (total_rd >= best_rd)
        return INT64_MAX;
    }
  }

  *rate = cost;
  *rate_y = tot_rate_y;
  *distortion = total_distortion;
  // The block-level mode is taken from the last (bottom-right) sub-block.
  mic->mbmi.mode = mic->bmi[3].as_mode;

  return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
}
1235
// Full rd search over the luma intra prediction modes for a block of size
// 8x8 or larger.  Writes the winning mode and tx size into the mode info
// and returns the best rd cost found (best_rd if no mode improved on it).
static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                      int *rate, int *rate_tokenonly,
                                      int64_t *distortion, int *skippable,
                                      BLOCK_SIZE bsize,
                                      int64_t tx_cache[TX_MODES],
                                      int64_t best_rd) {
  MB_PREDICTION_MODE mode;
  MB_PREDICTION_MODE mode_selected = DC_PRED;
  MACROBLOCKD *const xd = &x->e_mbd;
  MODE_INFO *const mic = xd->mi[0];
  int this_rate, this_rate_tokenonly, s;
  int64_t this_distortion, this_rd;
  TX_SIZE best_tx = TX_4X4;
  int i;
  int *bmode_costs = x->mbmode_cost;

  if (cpi->sf.tx_size_search_method == USE_FULL_RD)
    for (i = 0; i < TX_MODES; i++)
      tx_cache[i] = INT64_MAX;

  /* Y Search for intra prediction mode */
  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
    int64_t local_tx_cache[TX_MODES];
    MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
    MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;

    // Skip modes pruned out by the per-tx-size intra mode mask.
    if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
      continue;

    if (cpi->common.frame_type == KEY_FRAME) {
      // On key frames the mode cost is conditioned on the above/left
      // neighboring block modes.
      const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
      const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);

      bmode_costs = x->y_mode_costs[A][L];
    }
    mic->mbmi.mode = mode;

    intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
        &s, NULL, bsize, local_tx_cache, best_rd);

    // INT_MAX rate signals that the rd search bailed out early.
    if (this_rate_tokenonly == INT_MAX)
      continue;

    this_rate = this_rate_tokenonly + bmode_costs[mode];
    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);

    if (this_rd < best_rd) {
      mode_selected   = mode;
      best_rd         = this_rd;
      best_tx         = mic->mbmi.tx_size;
      *rate           = this_rate;
      *rate_tokenonly = this_rate_tokenonly;
      *distortion     = this_distortion;
      *skippable      = s;
    }

    if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
      // Fold this mode's per-tx-mode costs into the global cache,
      // adjusted so all entries are relative to the current tx mode.
      for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
        const int64_t adj_rd = this_rd + local_tx_cache[i] -
            local_tx_cache[cpi->common.tx_mode];
        if (adj_rd < tx_cache[i]) {
          tx_cache[i] = adj_rd;
        }
      }
    }
  }

  mic->mbmi.mode = mode_selected;
  mic->mbmi.tx_size = best_tx;

  return best_rd;
}
1308
1309static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
1310                             int *rate, int64_t *distortion, int *skippable,
1311                             int64_t *sse, BLOCK_SIZE bsize,
1312                             int64_t ref_best_rd) {
1313  MACROBLOCKD *const xd = &x->e_mbd;
1314  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1315  TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
1316  int plane;
1317  int pnrate = 0, pnskip = 1;
1318  int64_t pndist = 0, pnsse = 0;
1319
1320  if (ref_best_rd < 0)
1321    goto term;
1322
1323  if (is_inter_block(mbmi)) {
1324    int plane;
1325    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
1326      vp9_subtract_plane(x, bsize, plane);
1327  }
1328
1329  *rate = 0;
1330  *distortion = 0;
1331  *sse = 0;
1332  *skippable = 1;
1333
1334  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
1335    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
1336                     ref_best_rd, plane, bsize, uv_txfm_size,
1337                     cpi->sf.use_fast_coef_costing);
1338    if (pnrate == INT_MAX)
1339      goto term;
1340    *rate += pnrate;
1341    *distortion += pndist;
1342    *sse += pnsse;
1343    *skippable &= pnskip;
1344  }
1345  return;
1346
1347  term:
1348  *rate = INT_MAX;
1349  *distortion = INT64_MAX;
1350  *sse = INT64_MAX;
1351  *skippable = 0;
1352  return;
1353}
1354
// Full rd search over the chroma intra prediction modes.  Writes the
// winning mode into the mode info, fills the rate/distortion outputs, and
// returns the best rd cost found.  When the encoder manages coefficient
// buffers itself (!select_txfm_size), the winner's coefficient buffers
// are rotated into slot 0 of the pick-mode context.
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       PICK_MODE_CONTEXT *ctx,
                                       int *rate, int *rate_tokenonly,
                                       int64_t *distortion, int *skippable,
                                       BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  MB_PREDICTION_MODE mode;
  MB_PREDICTION_MODE mode_selected = DC_PRED;
  int64_t best_rd = INT64_MAX, this_rd;
  int this_rate_tokenonly, this_rate, s;
  int64_t this_distortion, this_sse;

  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
    // Skip modes pruned out by the per-tx-size uv mode mask.
    if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
      continue;

    xd->mi[0]->mbmi.uv_mode = mode;

    super_block_uvrd(cpi, x, &this_rate_tokenonly,
                     &this_distortion, &s, &this_sse, bsize, best_rd);
    // INT_MAX rate signals the rd search bailed out early.
    if (this_rate_tokenonly == INT_MAX)
      continue;
    this_rate = this_rate_tokenonly +
                x->intra_uv_mode_cost[cpi->common.frame_type][mode];
    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);

    if (this_rd < best_rd) {
      mode_selected   = mode;
      best_rd         = this_rd;
      *rate           = this_rate;
      *rate_tokenonly = this_rate_tokenonly;
      *distortion     = this_distortion;
      *skippable      = s;
      if (!x->select_txfm_size) {
        // Rotate coefficient buffers: the new winner's data moves into
        // ctx slot 0 and the previous slot-0 buffers are parked in slot
        // 2, which then becomes the working buffer for the next mode.
        int i;
        struct macroblock_plane *const p = x->plane;
        struct macroblockd_plane *const pd = xd->plane;
        for (i = 1; i < MAX_MB_PLANE; ++i) {
          p[i].coeff    = ctx->coeff_pbuf[i][2];
          p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
          pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
          p[i].eobs    = ctx->eobs_pbuf[i][2];

          ctx->coeff_pbuf[i][2]   = ctx->coeff_pbuf[i][0];
          ctx->qcoeff_pbuf[i][2]  = ctx->qcoeff_pbuf[i][0];
          ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
          ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];

          ctx->coeff_pbuf[i][0]   = p[i].coeff;
          ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
          ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
          ctx->eobs_pbuf[i][0]    = p[i].eobs;
        }
      }
    }
  }

  xd->mi[0]->mbmi.uv_mode = mode_selected;
  return best_rd;
}
1415
1416static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
1417                              int *rate, int *rate_tokenonly,
1418                              int64_t *distortion, int *skippable,
1419                              BLOCK_SIZE bsize) {
1420  const VP9_COMMON *cm = &cpi->common;
1421  int64_t unused;
1422
1423  x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
1424  super_block_uvrd(cpi, x, rate_tokenonly, distortion,
1425                   skippable, &unused, bsize, INT64_MAX);
1426  *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED];
1427  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
1428}
1429
1430static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
1431                                 BLOCK_SIZE bsize, TX_SIZE max_tx_size,
1432                                 int *rate_uv, int *rate_uv_tokenonly,
1433                                 int64_t *dist_uv, int *skip_uv,
1434                                 MB_PREDICTION_MODE *mode_uv) {
1435  MACROBLOCK *const x = &cpi->mb;
1436
1437  // Use an estimated rd for uv_intra based on DC_PRED if the
1438  // appropriate speed flag is set.
1439  if (cpi->sf.use_uv_intra_rd_estimate) {
1440    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
1441                   skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
1442  // Else do a proper rd search for each possible transform size that may
1443  // be considered in the main rd loop.
1444  } else {
1445    rd_pick_intra_sbuv_mode(cpi, x, ctx,
1446                            rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1447                            bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
1448  }
1449  *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
1450}
1451
1452static int cost_mv_ref(const VP9_COMP *cpi, MB_PREDICTION_MODE mode,
1453                       int mode_context) {
1454  const MACROBLOCK *const x = &cpi->mb;
1455  const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id;
1456
1457  // Don't account for mode here if segment skip is enabled.
1458  if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
1459    assert(is_inter_mode(mode));
1460    return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
1461  } else {
1462    return 0;
1463  }
1464}
1465
1466static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
1467                                BLOCK_SIZE bsize,
1468                                int_mv *frame_mv,
1469                                int mi_row, int mi_col,
1470                                int_mv single_newmv[MAX_REF_FRAMES],
1471                                int *rate_mv);
1472
// Sets the motion vector(s) for label i of a sub-8x8 partition according
// to the given mode, replicates the label into all covered 4x4 positions,
// and returns the rate cost of coding the mode (plus mv cost for NEWMV).
static int labels2mode(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
                       MB_PREDICTION_MODE mode,
                       int_mv this_mv[2],
                       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
                       int_mv seg_mvs[MAX_REF_FRAMES],
                       int_mv *best_ref_mv[2],
                       const int *mvjcost, int *mvcost[2]) {
  MODE_INFO *const mic = xd->mi[0];
  const MB_MODE_INFO *const mbmi = &mic->mbmi;
  int thismvcost = 0;
  int idx, idy;
  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
  const int is_compound = has_second_ref(mbmi);

  // the only time we should do costing for new motion vector or mode
  // is when we are on a new label  (jbb May 08, 2007)
  switch (mode) {
    case NEWMV:
      // NEWMV takes the searched mv and pays its differential coding cost
      // relative to the best reference mv.
      this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
      thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
                                    mvjcost, mvcost, MV_COST_WEIGHT_SUB);
      if (is_compound) {
        this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
        thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
                                      mvjcost, mvcost, MV_COST_WEIGHT_SUB);
      }
      break;
    case NEARESTMV:
      this_mv[0].as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
      if (is_compound)
        this_mv[1].as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
      break;
    case NEARMV:
      this_mv[0].as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
      if (is_compound)
        this_mv[1].as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
      break;
    case ZEROMV:
      this_mv[0].as_int = 0;
      if (is_compound)
        this_mv[1].as_int = 0;
      break;
    default:
      break;
  }

  mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
  if (is_compound)
    mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;

  mic->bmi[i].as_mode = mode;

  // Copy the label into every 4x4 position the partition covers.
  for (idy = 0; idy < num_4x4_blocks_high; ++idy)
    for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
                 &mic->bmi[i], sizeof(mic->bmi[i]));

  return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
            thismvcost;
}
1534
// Builds the inter prediction for label i of a sub-8x8 partition, then
// transforms, quantizes and costs its 4x4 residual blocks.  Returns the
// rd cost of the label, or INT64_MAX as soon as the running cost exceeds
// best_yrd.
static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                                       MACROBLOCK *x,
                                       int64_t best_yrd,
                                       int i,
                                       int *labelyrate,
                                       int64_t *distortion, int64_t *sse,
                                       ENTROPY_CONTEXT *ta,
                                       ENTROPY_CONTEXT *tl,
                                       int mi_row, int mi_col) {
  int k;
  MACROBLOCKD *xd = &x->e_mbd;
  struct macroblockd_plane *const pd = &xd->plane[0];
  struct macroblock_plane *const p = &x->plane[0];
  MODE_INFO *const mi = xd->mi[0];
  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
  const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
  int idx, idy;

  const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
                                                             p->src.stride)];
  uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
                                                        pd->dst.stride)];
  int64_t thisdistortion = 0, thissse = 0;
  int thisrate = 0, ref;
  const scan_order *so = &vp9_default_scan_orders[TX_4X4];
  const int is_compound = has_second_ref(&mi->mbmi);
  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);

  // Build the prediction from one or (for compound) two references.
  for (ref = 0; ref < 1 + is_compound; ++ref) {
    const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
                                               pd->pre[ref].stride)];
    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                              dst, pd->dst.stride,
                              &mi->bmi[i].as_mv[ref].as_mv,
                              &xd->block_refs[ref]->sf, width, height, ref,
                              kernel, MV_PRECISION_Q3,
                              mi_col * MI_SIZE + 4 * (i % 2),
                              mi_row * MI_SIZE + 4 * (i / 2));
  }

  vp9_subtract_block(height, width,
                     raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
                     src, p->src.stride,
                     dst, pd->dst.stride);

  // NOTE(review): k accumulates raster offsets across iterations; this is
  // presumably correct only because at most one of the two loop dims can
  // exceed a single iteration for sub-8x8 partitions -- confirm.
  k = i;
  for (idy = 0; idy < height / 4; ++idy) {
    for (idx = 0; idx < width / 4; ++idx) {
      int64_t ssz, rd, rd1, rd2;
      int16_t* coeff;

      k += (idy * 2 + idx);
      coeff = BLOCK_OFFSET(p->coeff, k);
      x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                    coeff, 8);
      vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
      thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                        16, &ssz);
      thissse += ssz;
      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
                              so->scan, so->neighbors,
                              cpi->sf.use_fast_coef_costing);
      // Early termination: compare both the coded and the skip ("sse
      // only") rd against the best cost seen so far.  >> 2 compensates
      // for the forward transform scaling.
      rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
      rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
      rd = MIN(rd1, rd2);
      if (rd >= best_yrd)
        return INT64_MAX;
    }
  }

  *distortion = thisdistortion >> 2;
  *labelyrate = thisrate;
  *sse = thissse >> 2;

  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
}
1612
// Per-4x4-label rate-distortion statistics collected while searching the
// inter modes of a sub-8x8 partition (see rd_check_segment_txsize()).
typedef struct {
  int eobs;               // end-of-block index copied from p->eobs[] for this label
  int brate;              // total rate for the label (mode/mv rate plus byrate)
  int byrate;             // coefficient (luma) rate only
  int64_t bdist;          // distortion for the label
  int64_t bsse;           // sum of squared errors for the label
  int64_t brdcost;        // combined RD cost; INT64_MAX marks "not evaluated/invalid"
  int_mv mvs[2];          // chosen motion vector(s), one entry per reference frame
  ENTROPY_CONTEXT ta[2];  // above entropy context snapshot after coding the label
  ENTROPY_CONTEXT tl[2];  // left entropy context snapshot after coding the label
} SEG_RDSTAT;
1624
// Aggregate state for the sub-8x8 segmentation search; one instance is kept
// per interpolation filter candidate (indexed via bsi_buf + filter_idx).
typedef struct {
  int_mv *ref_mv[2];  // best / second-best reference MVs (search centers)
  int_mv mvp;         // motion vector predictor for the NEWMV search

  int64_t segment_rd;           // best total RD cost found so far (INT64_MAX = invalid)
  int r;                        // accumulated rate over all labels
  int64_t d;                    // accumulated distortion
  int64_t sse;                  // accumulated sum of squared errors
  int segment_yrate;            // accumulated luma coefficient rate
  MB_PREDICTION_MODE modes[4];  // selected prediction mode per 4x4 label
  SEG_RDSTAT rdstat[4][INTER_MODES];  // per-label, per-mode statistics
  int mvthresh;  // threshold gating whether a new motion search is worthwhile
} BEST_SEG_INFO;
1638
1639static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
1640  return (mv->row >> 3) < x->mv_row_min ||
1641         (mv->row >> 3) > x->mv_row_max ||
1642         (mv->col >> 3) < x->mv_col_min ||
1643         (mv->col >> 3) > x->mv_col_max;
1644}
1645
1646static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
1647  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
1648  struct macroblock_plane *const p = &x->plane[0];
1649  struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
1650
1651  p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
1652  assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
1653  pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
1654                                                       pd->pre[0].stride)];
1655  if (has_second_ref(mbmi))
1656    pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
1657                                                         pd->pre[1].stride)];
1658}
1659
1660static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
1661                                  struct buf_2d orig_pre[2]) {
1662  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
1663  x->plane[0].src = orig_src;
1664  x->e_mbd.plane[0].pre[0] = orig_pre[0];
1665  if (has_second_ref(mbmi))
1666    x->e_mbd.plane[0].pre[1] = orig_pre[1];
1667}
1668
1669static INLINE int mv_has_subpel(const MV *mv) {
1670  return (mv->row & 0x0F) || (mv->col & 0x0F);
1671}
1672
1673// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
1674// TODO(aconverse): Find out if this is still productive then clean up or remove
1675static int check_best_zero_mv(
1676    const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
1677    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1678    int disable_inter_mode_mask, int this_mode, int ref_frame,
1679    int second_ref_frame) {
1680  if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
1681      (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
1682      frame_mv[this_mode][ref_frame].as_int == 0 &&
1683      (second_ref_frame == NONE ||
1684       frame_mv[this_mode][second_ref_frame].as_int == 0)) {
1685    int rfc = mode_context[ref_frame];
1686    int c1 = cost_mv_ref(cpi, NEARMV, rfc);
1687    int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
1688    int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
1689
1690    if (this_mode == NEARMV) {
1691      if (c1 > c3) return 0;
1692    } else if (this_mode == NEARESTMV) {
1693      if (c2 > c3) return 0;
1694    } else {
1695      assert(this_mode == ZEROMV);
1696      if (second_ref_frame == NONE) {
1697        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frame].as_int == 0) ||
1698            (c3 >= c1 && frame_mv[NEARMV][ref_frame].as_int == 0))
1699          return 0;
1700      } else {
1701        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frame].as_int == 0 &&
1702             frame_mv[NEARESTMV][second_ref_frame].as_int == 0) ||
1703            (c3 >= c1 && frame_mv[NEARMV][ref_frame].as_int == 0 &&
1704             frame_mv[NEARMV][second_ref_frame].as_int == 0))
1705          return 0;
1706      }
1707    }
1708  }
1709  return 1;
1710}
1711
// Per-label mode search for a sub-8x8 partition (4x4/4x8/8x4): for each 4x4
// label, evaluate the allowed inter modes (running a motion search for
// NEWMV), pick the lowest-RD mode, and accumulate the segment totals into
// bsi_buf[filter_idx]. Sets bsi->segment_rd to INT64_MAX on early abort.
static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                    const TileInfo *const tile,
                                    BEST_SEG_INFO *bsi_buf, int filter_idx,
                                    int_mv seg_mvs[4][MAX_REF_FRAMES],
                                    int mi_row, int mi_col) {
  int k, br = 0, idx, idy;
  int64_t bd = 0, block_sse = 0;
  MB_PREDICTION_MODE this_mode;
  MACROBLOCKD *xd = &x->e_mbd;
  VP9_COMMON *cm = &cpi->common;
  MODE_INFO *mi = xd->mi[0];
  MB_MODE_INFO *const mbmi = &mi->mbmi;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  const int label_count = 4;
  int64_t this_segment_rd = 0;
  int label_mv_thresh;
  int segmentyrate = 0;
  const BLOCK_SIZE bsize = mbmi->sb_type;
  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
  vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize];
  ENTROPY_CONTEXT t_above[2], t_left[2];
  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
  int mode_idx;
  int subpelmv = 1, have_ref = 0;
  const int has_second_rf = has_second_ref(mbmi);
  const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];

  // Working copies of the entropy contexts; updated as each label is chosen.
  vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
  vpx_memcpy(t_left, pd->left_context, sizeof(t_left));

  // 64 makes this threshold really big effectively
  // making it so that we very rarely check mvs on
  // segments.   setting this to 1 would make mv thresh
  // roughly equal to what it is for macroblocks
  label_mv_thresh = 1 * bsi->mvthresh / label_count;

  // Segmentation method overheads
  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
      // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
      int_mv mode_mv[MB_MODE_COUNT][2];
      int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
      MB_PREDICTION_MODE mode_selected = ZEROMV;
      int64_t best_rd = INT64_MAX;
      const int i = idy * 2 + idx;
      int ref;

      // Gather the NEAREST/NEAR/ZERO MV candidates for each reference frame.
      for (ref = 0; ref < 1 + has_second_rf; ++ref) {
        const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
        frame_mv[ZEROMV][frame].as_int = 0;
        vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
                                      &frame_mv[NEARESTMV][frame],
                                      &frame_mv[NEARMV][frame]);
      }

      // search for the best motion vector on this segment
      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
        const struct buf_2d orig_src = x->plane[0].src;
        struct buf_2d orig_pre[2];

        mode_idx = INTER_OFFSET(this_mode);
        bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
        if (disable_inter_mode_mask & (1 << mode_idx))
          continue;

        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
                                disable_inter_mode_mask,
                                this_mode, mbmi->ref_frame[0],
                                mbmi->ref_frame[1]))
          continue;

        vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
        vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
                   sizeof(bsi->rdstat[i][mode_idx].ta));
        vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
                   sizeof(bsi->rdstat[i][mode_idx].tl));

        // motion search for newmv (single predictor case only)
        if (!has_second_rf && this_mode == NEWMV &&
            seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
          int_mv *const new_mv = &mode_mv[NEWMV][0];
          int step_param = 0;
          int further_steps;
          int thissme, bestsme = INT_MAX;
          int sadpb = x->sadperbit4;
          MV mvp_full;
          int max_mv;

          /* Is the best so far sufficiently good that we cant justify doing
           * and new motion search. */
          if (best_rd < label_mv_thresh)
            break;

          if (cpi->oxcf.mode != MODE_SECONDPASS_BEST &&
              cpi->oxcf.mode != MODE_BESTQUALITY) {
            // use previous block's result as next block's MV predictor.
            if (i > 0) {
              bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
              if (i == 2)
                bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
            }
          }
          if (i == 0)
            max_mv = x->max_mv_context[mbmi->ref_frame[0]];
          else
            max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;

          if (cpi->sf.auto_mv_step_size && cm->show_frame) {
            // Take wtd average of the step_params based on the last frame's
            // max mv magnitude and the best ref mvs of the current block for
            // the given reference.
            step_param = (vp9_init_search_range(cpi, max_mv) +
                          cpi->mv_step_param) >> 1;
          } else {
            step_param = cpi->mv_step_param;
          }

          mvp_full.row = bsi->mvp.as_mv.row >> 3;
          mvp_full.col = bsi->mvp.as_mv.col >> 3;

          if (cpi->sf.adaptive_motion_search && cm->show_frame) {
            mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
            mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
            step_param = MAX(step_param, 8);
          }

          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
          // adjust src pointer for this block
          mi_buf_shift(x, i);

          vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);

          // Full-pel search with the method selected by the speed features.
          if (cpi->sf.search_method == HEX) {
            bestsme = vp9_hex_search(x, &mvp_full,
                                     step_param,
                                     sadpb, 1, v_fn_ptr, 1,
                                     &bsi->ref_mv[0]->as_mv,
                                     &new_mv->as_mv);
            if (bestsme < INT_MAX)
              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
                                           &bsi->ref_mv[0]->as_mv,
                                           v_fn_ptr, 1);
          } else if (cpi->sf.search_method == SQUARE) {
            bestsme = vp9_square_search(x, &mvp_full,
                                        step_param,
                                        sadpb, 1, v_fn_ptr, 1,
                                        &bsi->ref_mv[0]->as_mv,
                                        &new_mv->as_mv);
            if (bestsme < INT_MAX)
              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
                                           &bsi->ref_mv[0]->as_mv,
                                           v_fn_ptr, 1);
          } else if (cpi->sf.search_method == BIGDIA) {
            bestsme = vp9_bigdia_search(x, &mvp_full,
                                        step_param,
                                        sadpb, 1, v_fn_ptr, 1,
                                        &bsi->ref_mv[0]->as_mv,
                                        &new_mv->as_mv);
            if (bestsme < INT_MAX)
              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
                                           &bsi->ref_mv[0]->as_mv,
                                           v_fn_ptr, 1);
          } else {
            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                             sadpb, further_steps, 0, v_fn_ptr,
                                             &bsi->ref_mv[0]->as_mv,
                                             &new_mv->as_mv);
          }

          // Should we do a full search (best quality only)
          if (cpi->oxcf.mode == MODE_BESTQUALITY ||
              cpi->oxcf.mode == MODE_SECONDPASS_BEST) {
            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
            /* Check if mvp_full is within the range. */
            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                     x->mv_row_min, x->mv_row_max);
            thissme = cpi->full_search_sad(x, &mvp_full,
                                           sadpb, 16, v_fn_ptr,
                                           x->nmvjointcost, x->mvcost,
                                           &bsi->ref_mv[0]->as_mv,
                                           &best_mv->as_mv);
            if (thissme < bestsme) {
              bestsme = thissme;
              new_mv->as_int = best_mv->as_int;
            } else {
              // The full search result is actually worse so re-instate the
              // previous best vector
              best_mv->as_int = new_mv->as_int;
            }
          }

          // Sub-pel refinement of the full-pel winner.
          if (bestsme < INT_MAX) {
            int distortion;
            cpi->find_fractional_mv_step(x,
                                         &new_mv->as_mv,
                                         &bsi->ref_mv[0]->as_mv,
                                         cm->allow_high_precision_mv,
                                         x->errorperbit, v_fn_ptr,
                                         cpi->sf.subpel_force_stop,
                                         cpi->sf.subpel_iters_per_step,
                                         x->nmvjointcost, x->mvcost,
                                         &distortion,
                                         &x->pred_sse[mbmi->ref_frame[0]]);

            // save motion search result for use in compound prediction
            seg_mvs[i][mbmi->ref_frame[0]].as_int = new_mv->as_int;
          }

          if (cpi->sf.adaptive_motion_search)
            x->pred_mv[mbmi->ref_frame[0]].as_int = new_mv->as_int;

          // restore src pointers
          mi_buf_restore(x, orig_src, orig_pre);
        }

        // Compound prediction needs a valid motion vector for both refs.
        if (has_second_rf) {
          if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
              seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
            continue;
        }

        if (has_second_rf && this_mode == NEWMV &&
            mbmi->interp_filter == EIGHTTAP) {
          // adjust src pointers
          mi_buf_shift(x, i);
          if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
            int rate_mv;
            joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
                                mi_row, mi_col, seg_mvs[i],
                                &rate_mv);
            seg_mvs[i][mbmi->ref_frame[0]].as_int =
                frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
            seg_mvs[i][mbmi->ref_frame[1]].as_int =
                frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
          }
          // restore src pointers
          mi_buf_restore(x, orig_src, orig_pre);
        }

        bsi->rdstat[i][mode_idx].brate =
            labels2mode(cpi, xd, i, this_mode, mode_mv[this_mode], frame_mv,
                        seg_mvs[i], bsi->ref_mv, x->nmvjointcost, x->mvcost);

        // Record the MVs for this label, and duplicate them into the labels
        // covered by wider/taller (8x4/4x8) blocks.
        for (ref = 0; ref < 1 + has_second_rf; ++ref) {
          bsi->rdstat[i][mode_idx].mvs[ref].as_int =
              mode_mv[this_mode][ref].as_int;
          if (num_4x4_blocks_wide > 1)
            bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
                mode_mv[this_mode][ref].as_int;
          if (num_4x4_blocks_high > 1)
            bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
                mode_mv[this_mode][ref].as_int;
        }

        // Trap vectors that reach beyond the UMV borders
        if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
            (has_second_rf &&
             mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
          continue;

        // When evaluating a non-reference filter, try to reuse the RD result
        // computed for an earlier filter: valid only when all MVs are
        // full-pel and identical to that filter's choice.
        if (filter_idx > 0) {
          BEST_SEG_INFO *ref_bsi = bsi_buf;
          subpelmv = 0;
          have_ref = 1;

          for (ref = 0; ref < 1 + has_second_rf; ++ref) {
            subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
            have_ref &= mode_mv[this_mode][ref].as_int ==
                ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
          }

          if (filter_idx > 1 && !subpelmv && !have_ref) {
            ref_bsi = bsi_buf + 1;
            have_ref = 1;
            for (ref = 0; ref < 1 + has_second_rf; ++ref)
              have_ref &= mode_mv[this_mode][ref].as_int ==
                  ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
          }

          if (!subpelmv && have_ref &&
              ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
            vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
                       sizeof(SEG_RDSTAT));
            if (num_4x4_blocks_wide > 1)
              bsi->rdstat[i + 1][mode_idx].eobs =
                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
            if (num_4x4_blocks_high > 1)
              bsi->rdstat[i + 2][mode_idx].eobs =
                  ref_bsi->rdstat[i + 2][mode_idx].eobs;

            if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
              mode_selected = this_mode;
              best_rd = bsi->rdstat[i][mode_idx].brdcost;
            }
            continue;
          }
        }

        // Full rate-distortion evaluation of this label with the chosen mode.
        bsi->rdstat[i][mode_idx].brdcost =
            encode_inter_mb_segment(cpi, x,
                                    bsi->segment_rd - this_segment_rd, i,
                                    &bsi->rdstat[i][mode_idx].byrate,
                                    &bsi->rdstat[i][mode_idx].bdist,
                                    &bsi->rdstat[i][mode_idx].bsse,
                                    bsi->rdstat[i][mode_idx].ta,
                                    bsi->rdstat[i][mode_idx].tl,
                                    mi_row, mi_col);
        if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
          bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
                                            bsi->rdstat[i][mode_idx].brate, 0);
          bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
          bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
          if (num_4x4_blocks_wide > 1)
            bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
          if (num_4x4_blocks_high > 1)
            bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
        }

        if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
          mode_selected = this_mode;
          best_rd = bsi->rdstat[i][mode_idx].brdcost;
        }
      } /*for each 4x4 mode*/

      // No mode produced a finite RD cost for this label: invalidate the
      // remaining labels and abort the whole segmentation.
      if (best_rd == INT64_MAX) {
        int iy, midx;
        for (iy = i + 1; iy < 4; ++iy)
          for (midx = 0; midx < INTER_MODES; ++midx)
            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
        bsi->segment_rd = INT64_MAX;
        return;
      }

      // Commit the winning mode: restore its entropy contexts and re-apply
      // its MVs to the mode info before accumulating the totals.
      mode_idx = INTER_OFFSET(mode_selected);
      vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
      vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));

      labels2mode(cpi, xd, i, mode_selected, mode_mv[mode_selected],
                  frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
                  x->mvcost);

      br += bsi->rdstat[i][mode_idx].brate;
      bd += bsi->rdstat[i][mode_idx].bdist;
      block_sse += bsi->rdstat[i][mode_idx].bsse;
      segmentyrate += bsi->rdstat[i][mode_idx].byrate;
      this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;

      // Early exit when the running total already exceeds the best so far.
      if (this_segment_rd > bsi->segment_rd) {
        int iy, midx;
        for (iy = i + 1; iy < 4; ++iy)
          for (midx = 0; midx < INTER_MODES; ++midx)
            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
        bsi->segment_rd = INT64_MAX;
        return;
      }
    }
  } /* for each label */

  bsi->r = br;
  bsi->d = bd;
  bsi->segment_yrate = segmentyrate;
  bsi->segment_rd = this_segment_rd;
  bsi->sse = block_sse;

  // update the coding decisions
  for (k = 0; k < 4; ++k)
    bsi->modes[k] = mi->bmi[k].as_mode;
}
2083
// Top-level sub-8x8 partition mode search: initializes the BEST_SEG_INFO
// scratch state for filter_idx, runs rd_check_segment_txsize(), and if the
// result beats best_rd, copies the winning per-label modes/MVs back into the
// block's mode info and reports rate/distortion/sse to the caller.
// Returns the segment RD cost, or INT64_MAX if nothing beat best_rd.
static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                           const TileInfo *const tile,
                                           int_mv *best_ref_mv,
                                           int_mv *second_best_ref_mv,
                                           int64_t best_rd,
                                           int *returntotrate,
                                           int *returnyrate,
                                           int64_t *returndistortion,
                                           int *skippable, int64_t *psse,
                                           int mvthresh,
                                           int_mv seg_mvs[4][MAX_REF_FRAMES],
                                           BEST_SEG_INFO *bsi_buf,
                                           int filter_idx,
                                           int mi_row, int mi_col) {
  int i;
  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
  MACROBLOCKD *xd = &x->e_mbd;
  MODE_INFO *mi = xd->mi[0];
  MB_MODE_INFO *mbmi = &mi->mbmi;
  int mode_idx;

  // Start from a clean slate for this filter's search state.
  vp9_zero(*bsi);

  bsi->segment_rd = best_rd;
  bsi->ref_mv[0] = best_ref_mv;
  bsi->ref_mv[1] = second_best_ref_mv;
  bsi->mvp.as_int = best_ref_mv->as_int;
  bsi->mvthresh = mvthresh;

  for (i = 0; i < 4; i++)
    bsi->modes[i] = ZEROMV;

  rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
                          mi_row, mi_col);

  // The search aborts by setting segment_rd to INT64_MAX (or simply did not
  // improve on best_rd); report failure to the caller.
  if (bsi->segment_rd > best_rd)
    return INT64_MAX;
  /* set it to the best */
  for (i = 0; i < 4; i++) {
    mode_idx = INTER_OFFSET(bsi->modes[i]);
    mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
    if (has_second_ref(mbmi))
      mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
    x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
    mi->bmi[i].as_mode = bsi->modes[i];
  }

  /*
   * used to set mbmi->mv.as_int
   */
  *returntotrate = bsi->r;
  *returndistortion = bsi->d;
  *returnyrate = bsi->segment_yrate;
  *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
  *psse = bsi->sse;
  // The block-level mode of the last (bottom-right) label stands in for the
  // macroblock mode.
  mbmi->mode = bsi->modes[3];

  return bsi->segment_rd;
}
2143
2144static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
2145                    uint8_t *ref_y_buffer, int ref_y_stride,
2146                    int ref_frame, BLOCK_SIZE block_size ) {
2147  MACROBLOCKD *xd = &x->e_mbd;
2148  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2149  int_mv this_mv;
2150  int i;
2151  int zero_seen = 0;
2152  int best_index = 0;
2153  int best_sad = INT_MAX;
2154  int this_sad = INT_MAX;
2155  int max_mv = 0;
2156
2157  uint8_t *src_y_ptr = x->plane[0].src.buf;
2158  uint8_t *ref_y_ptr;
2159  int row_offset, col_offset;
2160  int num_mv_refs = MAX_MV_REF_CANDIDATES +
2161                    (cpi->sf.adaptive_motion_search &&
2162                     cpi->common.show_frame &&
2163                     block_size < cpi->sf.max_partition_size);
2164
2165  int_mv pred_mv[3];
2166  pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
2167  pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
2168  pred_mv[2] = x->pred_mv[ref_frame];
2169
2170  // Get the sad for each candidate reference mv
2171  for (i = 0; i < num_mv_refs; i++) {
2172    this_mv.as_int = pred_mv[i].as_int;
2173
2174    max_mv = MAX(max_mv,
2175                 MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
2176    // only need to check zero mv once
2177    if (!this_mv.as_int && zero_seen)
2178      continue;
2179
2180    zero_seen = zero_seen || !this_mv.as_int;
2181
2182    row_offset = this_mv.as_mv.row >> 3;
2183    col_offset = this_mv.as_mv.col >> 3;
2184    ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
2185
2186    // Find sad for current vector.
2187    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
2188                                           ref_y_ptr, ref_y_stride,
2189                                           0x7fffffff);
2190
2191    // Note if it is the best so far.
2192    if (this_sad < best_sad) {
2193      best_sad = this_sad;
2194      best_index = i;
2195    }
2196  }
2197
2198  // Note the index of the mv that worked best in the reference list.
2199  x->mv_best_ref_index[ref_frame] = best_index;
2200  x->max_mv_context[ref_frame] = max_mv;
2201  x->pred_mv_sad[ref_frame] = best_sad;
2202}
2203
// Fill in the bit costs of signalling each reference frame choice for the
// current segment: ref_costs_single[] for single prediction,
// ref_costs_comp[] for compound prediction, and the probability used to
// code the compound/single selection in *comp_mode_p.
static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
                                     unsigned int *ref_costs_single,
                                     unsigned int *ref_costs_comp,
                                     vp9_prob *comp_mode_p) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
                                             SEG_LVL_REF_FRAME);
  if (seg_ref_active) {
    // The segment fixes the reference frame, so nothing is signalled:
    // all costs are zero and the mode probability is left at an even 128.
    vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
    vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
    *comp_mode_p = 128;
  } else {
    vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
    vp9_prob comp_inter_p = 128;

    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
      comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
      *comp_mode_p = comp_inter_p;
    } else {
      *comp_mode_p = 128;
    }

    ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);

    // Single-reference costs: intra/inter bit, optional compound-select bit,
    // then the two single-reference prediction bits.
    if (cm->reference_mode != COMPOUND_REFERENCE) {
      vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
      vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
      unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);

      if (cm->reference_mode == REFERENCE_MODE_SELECT)
        base_cost += vp9_cost_bit(comp_inter_p, 0);

      ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
          ref_costs_single[ALTREF_FRAME] = base_cost;
      ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
      ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
      ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
      ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
      ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
    } else {
      // Single prediction is not available; park the costs at a fixed value.
      ref_costs_single[LAST_FRAME]   = 512;
      ref_costs_single[GOLDEN_FRAME] = 512;
      ref_costs_single[ALTREF_FRAME] = 512;
    }
    // Compound-reference costs, analogous to the single case.
    if (cm->reference_mode != SINGLE_REFERENCE) {
      vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
      unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);

      if (cm->reference_mode == REFERENCE_MODE_SELECT)
        base_cost += vp9_cost_bit(comp_inter_p, 1);

      ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
      ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
    } else {
      ref_costs_comp[LAST_FRAME]   = 512;
      ref_costs_comp[GOLDEN_FRAME] = 512;
    }
  }
}
2264
2265static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
2266                         int mode_index,
2267                         int_mv *ref_mv,
2268                         int_mv *second_ref_mv,
2269                         int64_t comp_pred_diff[REFERENCE_MODES],
2270                         int64_t tx_size_diff[TX_MODES],
2271                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
2272  MACROBLOCKD *const xd = &x->e_mbd;
2273
2274  // Take a snapshot of the coding context so it can be
2275  // restored if we decide to encode this way
2276  ctx->skip = x->skip;
2277  ctx->best_mode_index = mode_index;
2278  ctx->mic = *xd->mi[0];
2279
2280  ctx->best_ref_mv[0].as_int = ref_mv->as_int;
2281  ctx->best_ref_mv[1].as_int = second_ref_mv->as_int;
2282
2283  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
2284  ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
2285  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
2286
2287  vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
2288  vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
2289             sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
2290}
2291
// Point dst[] at the (mi_row, mi_col) position within each plane of |src|,
// applying |scale| to the luma plane and |scale_uv| to the chroma planes.
static void setup_pred_block(const MACROBLOCKD *xd,
                             struct buf_2d dst[MAX_MB_PLANE],
                             const YV12_BUFFER_CONFIG *src,
                             int mi_row, int mi_col,
                             const struct scale_factors *scale,
                             const struct scale_factors *scale_uv) {
  int i;

  dst[0].buf = src->y_buffer;
  dst[0].stride = src->y_stride;
  dst[1].buf = src->u_buffer;
  dst[2].buf = src->v_buffer;
  // Both chroma planes share a single stride.
  dst[1].stride = dst[2].stride = src->uv_stride;
#if CONFIG_ALPHA
  dst[3].buf = src->alpha_buffer;
  dst[3].stride = src->alpha_stride;
#endif

  // TODO(jkoleszar): Make scale factors per-plane data
  for (i = 0; i < MAX_MB_PLANE; i++) {
    setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
                     i ? scale_uv : scale,
                     xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
  }
}
2317
// Prepare the per-reference-frame data used by the inter mode search:
// plane buffers pointing into the reference frame (yv12_mb), the ordered
// candidate MV list, and the refined nearest/near MV predictions.
void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                            const TileInfo *const tile,
                            MV_REFERENCE_FRAME ref_frame,
                            BLOCK_SIZE block_size,
                            int mi_row, int mi_col,
                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
                            int_mv frame_near_mv[MAX_REF_FRAMES],
                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
  const VP9_COMMON *cm = &cpi->common;
  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
  MACROBLOCKD *const xd = &x->e_mbd;
  MODE_INFO *const mi = xd->mi[0];
  int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
  // frame_refs is indexed from 0 while ref_frame starts at LAST_FRAME (1).
  const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;

  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
  // use the UV scaling factors.
  setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);

  // Gets an initial list of candidate vectors from neighbours and orders them
  vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);

  // Candidate refinement carried out at encoder and decoder
  vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
                        &frame_nearest_mv[ref_frame],
                        &frame_near_mv[ref_frame]);

  // Further refinement that is encode side only to test the top few candidates
  // in full and choose the best as the centre point for subsequent searches.
  // The current implementation doesn't support scaling.
  if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
    mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
            ref_frame, block_size);
}
2352
2353const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
2354                                                   int ref_frame) {
2355  const VP9_COMMON *const cm = &cpi->common;
2356  const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
2357  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
2358  return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
2359}
2360
2361int vp9_get_switchable_rate(const MACROBLOCK *x) {
2362  const MACROBLOCKD *const xd = &x->e_mbd;
2363  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2364  const int ctx = vp9_get_pred_context_switchable_interp(xd);
2365  return SWITCHABLE_INTERP_RATE_FACTOR *
2366             x->switchable_interp_costs[ctx][mbmi->interp_filter];
2367}
2368
2369static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2370                                 const TileInfo *const tile,
2371                                 BLOCK_SIZE bsize,
2372                                 int mi_row, int mi_col,
2373                                 int_mv *tmp_mv, int *rate_mv) {
2374  MACROBLOCKD *xd = &x->e_mbd;
2375  VP9_COMMON *cm = &cpi->common;
2376  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2377  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
2378  int bestsme = INT_MAX;
2379  int further_steps, step_param;
2380  int sadpb = x->sadperbit16;
2381  MV mvp_full;
2382  int ref = mbmi->ref_frame[0];
2383  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
2384
2385  int tmp_col_min = x->mv_col_min;
2386  int tmp_col_max = x->mv_col_max;
2387  int tmp_row_min = x->mv_row_min;
2388  int tmp_row_max = x->mv_row_max;
2389
2390  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
2391                                                                        ref);
2392
2393  MV pred_mv[3];
2394  pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
2395  pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
2396  pred_mv[2] = x->pred_mv[ref].as_mv;
2397
2398  if (scaled_ref_frame) {
2399    int i;
2400    // Swap out the reference frame for a version that's been scaled to
2401    // match the resolution of the current frame, allowing the existing
2402    // motion search code to be used without additional modifications.
2403    for (i = 0; i < MAX_MB_PLANE; i++)
2404      backup_yv12[i] = xd->plane[i].pre[0];
2405
2406    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
2407  }
2408
2409  vp9_set_mv_search_range(x, &ref_mv);
2410
2411  // Work out the size of the first step in the mv step search.
2412  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
2413  if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
2414    // Take wtd average of the step_params based on the last frame's
2415    // max mv magnitude and that based on the best ref mvs of the current
2416    // block for the given reference.
2417    step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
2418                  cpi->mv_step_param) >> 1;
2419  } else {
2420    step_param = cpi->mv_step_param;
2421  }
2422
2423  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
2424      cpi->common.show_frame) {
2425    int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
2426                                                       b_width_log2(bsize)));
2427    step_param = MAX(step_param, boffset);
2428  }
2429
2430  if (cpi->sf.adaptive_motion_search) {
2431    int bwl = b_width_log2_lookup[bsize];
2432    int bhl = b_height_log2_lookup[bsize];
2433    int i;
2434    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
2435
2436    if (tlevel < 5)
2437      step_param += 2;
2438
2439    for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) {
2440      if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
2441        x->pred_mv[ref].as_int = 0;
2442        tmp_mv->as_int = INVALID_MV;
2443
2444        if (scaled_ref_frame) {
2445          int i;
2446          for (i = 0; i < MAX_MB_PLANE; i++)
2447            xd->plane[i].pre[0] = backup_yv12[i];
2448        }
2449        return;
2450      }
2451    }
2452  }
2453
2454  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
2455
2456  mvp_full.col >>= 3;
2457  mvp_full.row >>= 3;
2458
2459  // Further step/diamond searches as necessary
2460  further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
2461
2462  if (cpi->sf.search_method == FAST_DIAMOND) {
2463    bestsme = vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0,
2464                                  &cpi->fn_ptr[bsize], 1,
2465                                  &ref_mv, &tmp_mv->as_mv);
2466    if (bestsme < INT_MAX)
2467      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
2468                                   &cpi->fn_ptr[bsize], 1);
2469  } else if (cpi->sf.search_method == FAST_HEX) {
2470    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
2471                                  &cpi->fn_ptr[bsize], 1,
2472                                  &ref_mv, &tmp_mv->as_mv);
2473    if (bestsme < INT_MAX)
2474      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
2475                                   &cpi->fn_ptr[bsize], 1);
2476  } else if (cpi->sf.search_method == HEX) {
2477    bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
2478                             &cpi->fn_ptr[bsize], 1,
2479                             &ref_mv, &tmp_mv->as_mv);
2480    if (bestsme < INT_MAX)
2481      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
2482                                   &cpi->fn_ptr[bsize], 1);
2483  } else if (cpi->sf.search_method == SQUARE) {
2484    bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
2485                                &cpi->fn_ptr[bsize], 1,
2486                                &ref_mv, &tmp_mv->as_mv);
2487    if (bestsme < INT_MAX)
2488      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
2489                                   &cpi->fn_ptr[bsize], 1);
2490  } else if (cpi->sf.search_method == BIGDIA) {
2491    bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
2492                                &cpi->fn_ptr[bsize], 1,
2493                                &ref_mv, &tmp_mv->as_mv);
2494    if (bestsme < INT_MAX)
2495      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
2496                                   &cpi->fn_ptr[bsize], 1);
2497  } else {
2498    bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
2499                                     sadpb, further_steps, 1,
2500                                     &cpi->fn_ptr[bsize],
2501                                     &ref_mv, &tmp_mv->as_mv);
2502  }
2503
2504  x->mv_col_min = tmp_col_min;
2505  x->mv_col_max = tmp_col_max;
2506  x->mv_row_min = tmp_row_min;
2507  x->mv_row_max = tmp_row_max;
2508
2509  if (bestsme < INT_MAX) {
2510    int dis;  /* TODO: use dis in distortion calculation later. */
2511    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
2512                                 cm->allow_high_precision_mv,
2513                                 x->errorperbit,
2514                                 &cpi->fn_ptr[bsize],
2515                                 cpi->sf.subpel_force_stop,
2516                                 cpi->sf.subpel_iters_per_step,
2517                                 x->nmvjointcost, x->mvcost,
2518                                 &dis, &x->pred_sse[ref]);
2519  }
2520  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
2521                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2522
2523  if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
2524    x->pred_mv[ref].as_int = tmp_mv->as_int;
2525
2526  if (scaled_ref_frame) {
2527    int i;
2528    for (i = 0; i < MAX_MB_PLANE; i++)
2529      xd->plane[i].pre[0] = backup_yv12[i];
2530  }
2531}
2532
2533static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2534                                BLOCK_SIZE bsize,
2535                                int_mv *frame_mv,
2536                                int mi_row, int mi_col,
2537                                int_mv single_newmv[MAX_REF_FRAMES],
2538                                int *rate_mv) {
2539  const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
2540  const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
2541  MACROBLOCKD *xd = &x->e_mbd;
2542  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2543  const int refs[2] = { mbmi->ref_frame[0],
2544                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
2545  int_mv ref_mv[2];
2546  int ite, ref;
2547  // Prediction buffer from second frame.
2548  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
2549  const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
2550
2551  // Do joint motion search in compound mode to get more accurate mv.
2552  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
2553  struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
2554  int last_besterr[2] = {INT_MAX, INT_MAX};
2555  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
2556    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
2557    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
2558  };
2559
2560  for (ref = 0; ref < 2; ++ref) {
2561    ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
2562
2563    if (scaled_ref_frame[ref]) {
2564      int i;
2565      // Swap out the reference frame for a version that's been scaled to
2566      // match the resolution of the current frame, allowing the existing
2567      // motion search code to be used without additional modifications.
2568      for (i = 0; i < MAX_MB_PLANE; i++)
2569        backup_yv12[ref][i] = xd->plane[i].pre[ref];
2570      vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
2571                           NULL);
2572    }
2573
2574    frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
2575  }
2576
2577  // Allow joint search multiple times iteratively for each ref frame
2578  // and break out the search loop if it couldn't find better mv.
2579  for (ite = 0; ite < 4; ite++) {
2580    struct buf_2d ref_yv12[2];
2581    int bestsme = INT_MAX;
2582    int sadpb = x->sadperbit16;
2583    int_mv tmp_mv;
2584    int search_range = 3;
2585
2586    int tmp_col_min = x->mv_col_min;
2587    int tmp_col_max = x->mv_col_max;
2588    int tmp_row_min = x->mv_row_min;
2589    int tmp_row_max = x->mv_row_max;
2590    int id = ite % 2;
2591
2592    // Initialized here because of compiler problem in Visual Studio.
2593    ref_yv12[0] = xd->plane[0].pre[0];
2594    ref_yv12[1] = xd->plane[0].pre[1];
2595
2596    // Get pred block from second frame.
2597    vp9_build_inter_predictor(ref_yv12[!id].buf,
2598                              ref_yv12[!id].stride,
2599                              second_pred, pw,
2600                              &frame_mv[refs[!id]].as_mv,
2601                              &xd->block_refs[!id]->sf,
2602                              pw, ph, 0,
2603                              kernel, MV_PRECISION_Q3,
2604                              mi_col * MI_SIZE, mi_row * MI_SIZE);
2605
2606    // Compound motion search on first ref frame.
2607    if (id)
2608      xd->plane[0].pre[0] = ref_yv12[id];
2609    vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
2610
2611    // Use mv result from single mode as mvp.
2612    tmp_mv.as_int = frame_mv[refs[id]].as_int;
2613
2614    tmp_mv.as_mv.col >>= 3;
2615    tmp_mv.as_mv.row >>= 3;
2616
2617    // Small-range full-pixel motion search
2618    bestsme = vp9_refining_search_8p_c(x, &tmp_mv.as_mv, sadpb,
2619                                       search_range,
2620                                       &cpi->fn_ptr[bsize],
2621                                       x->nmvjointcost, x->mvcost,
2622                                       &ref_mv[id].as_mv, second_pred,
2623                                       pw, ph);
2624    if (bestsme < INT_MAX)
2625      bestsme = vp9_get_mvpred_av_var(x, &tmp_mv.as_mv, &ref_mv[id].as_mv,
2626                                      second_pred, &cpi->fn_ptr[bsize], 1);
2627
2628    x->mv_col_min = tmp_col_min;
2629    x->mv_col_max = tmp_col_max;
2630    x->mv_row_min = tmp_row_min;
2631    x->mv_row_max = tmp_row_max;
2632
2633    if (bestsme < INT_MAX) {
2634      int dis; /* TODO: use dis in distortion calculation later. */
2635      unsigned int sse;
2636      bestsme = cpi->find_fractional_mv_step_comp(
2637          x, &tmp_mv.as_mv,
2638          &ref_mv[id].as_mv,
2639          cpi->common.allow_high_precision_mv,
2640          x->errorperbit,
2641          &cpi->fn_ptr[bsize],
2642          0, cpi->sf.subpel_iters_per_step,
2643          x->nmvjointcost, x->mvcost,
2644          &dis, &sse, second_pred,
2645          pw, ph);
2646    }
2647
2648    if (id)
2649      xd->plane[0].pre[0] = scaled_first_yv12;
2650
2651    if (bestsme < last_besterr[id]) {
2652      frame_mv[refs[id]].as_int = tmp_mv.as_int;
2653      last_besterr[id] = bestsme;
2654    } else {
2655      break;
2656    }
2657  }
2658
2659  *rate_mv = 0;
2660
2661  for (ref = 0; ref < 2; ++ref) {
2662    if (scaled_ref_frame[ref]) {
2663      // restore the predictor
2664      int i;
2665      for (i = 0; i < MAX_MB_PLANE; i++)
2666        xd->plane[i].pre[ref] = backup_yv12[ref][i];
2667    }
2668
2669    *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
2670                                &mbmi->ref_mvs[refs[ref]][0].as_mv,
2671                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2672  }
2673
2674  vpx_free(second_pred);
2675}
2676
2677static INLINE void restore_dst_buf(MACROBLOCKD *xd,
2678                                   uint8_t *orig_dst[MAX_MB_PLANE],
2679                                   int orig_dst_stride[MAX_MB_PLANE]) {
2680  int i;
2681  for (i = 0; i < MAX_MB_PLANE; i++) {
2682    xd->plane[i].dst.buf = orig_dst[i];
2683    xd->plane[i].dst.stride = orig_dst_stride[i];
2684  }
2685}
2686
// Computes the rate-distortion cost of coding the current block with the
// inter mode already installed in mbmi (mbmi->mode / mbmi->ref_frame).
// Runs the NEWMV motion search when required, searches for the best
// switchable interpolation filter, builds the inter prediction, optionally
// applies the encode-breakout skip test, and accumulates Y/UV rate and
// distortion into the output pointers.  Returns INT64_MAX when the mode
// cannot be coded (invalid or out-of-bounds MVs, or an RD breakout fired);
// otherwise returns this_rd, which is 0 unless the breakout skip path
// computed it — a 0 return is re-calculated by the caller.
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                 const TileInfo *const tile,
                                 BLOCK_SIZE bsize,
                                 int64_t txfm_cache[],
                                 int *rate2, int64_t *distortion,
                                 int *skippable,
                                 int *rate_y, int64_t *distortion_y,
                                 int *rate_uv, int64_t *distortion_uv,
                                 int *mode_excluded, int *disable_skip,
                                 INTERP_FILTER *best_filter,
                                 int_mv (*mode_mv)[MAX_REF_FRAMES],
                                 int mi_row, int mi_col,
                                 int_mv single_newmv[MAX_REF_FRAMES],
                                 int64_t *psse,
                                 const int64_t ref_best_rd) {
  VP9_COMMON *cm = &cpi->common;
  MACROBLOCKD *xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  const int is_comp_pred = has_second_ref(mbmi);
  const int num_refs = is_comp_pred ? 2 : 1;
  const int this_mode = mbmi->mode;
  int_mv *frame_mv = mode_mv[this_mode];
  int i;
  int refs[2] = { mbmi->ref_frame[0],
    (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
  int_mv cur_mv[2];
  int64_t this_rd = 0;
  // Scratch prediction area: one 64x64 buffer per plane, used while
  // comparing interpolation filters without clobbering the real dst.
  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
  int pred_exists = 0;
  int intpel_mv;
  int64_t rd, best_rd = INT64_MAX;
  int best_needs_copy = 0;
  uint8_t *orig_dst[MAX_MB_PLANE];
  int orig_dst_stride[MAX_MB_PLANE];
  int rs = 0;

  // Compound prediction needs both MVs to be valid.
  if (is_comp_pred) {
    if (frame_mv[refs[0]].as_int == INVALID_MV ||
        frame_mv[refs[1]].as_int == INVALID_MV)
      return INT64_MAX;
  }

  // NEWMV requires an actual motion search; other modes reuse mode_mv.
  if (this_mode == NEWMV) {
    int rate_mv;
    if (is_comp_pred) {
      // Initialize mv using single prediction mode result.
      frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
      frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;

      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
        joint_motion_search(cpi, x, bsize, frame_mv,
                            mi_row, mi_col, single_newmv, &rate_mv);
      } else {
        // No joint refinement: just cost the two single-prediction MVs.
        rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                   &mbmi->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
        rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
                                   &mbmi->ref_mvs[refs[1]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
      }
      *rate2 += rate_mv;
    } else {
      int_mv tmp_mv;
      single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
                           &tmp_mv, &rate_mv);
      if (tmp_mv.as_int == INVALID_MV)
        return INT64_MAX;
      *rate2 += rate_mv;
      frame_mv[refs[0]].as_int =
          xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
      single_newmv[refs[0]].as_int = tmp_mv.as_int;
    }
  }

  for (i = 0; i < num_refs; ++i) {
    cur_mv[i] = frame_mv[refs[i]];
    // Clip "next_nearest" so that it does not extend to far out of image
    if (this_mode != NEWMV)
      clamp_mv2(&cur_mv[i].as_mv, xd);

    if (mv_check_bounds(x, &cur_mv[i].as_mv))
      return INT64_MAX;
    mbmi->mv[i].as_int = cur_mv[i].as_int;
  }

  // do first prediction into the destination buffer. Do the next
  // prediction into a temporary buffer. Then keep track of which one
  // of these currently holds the best predictor, and use the other
  // one for future predictions. In the end, copy from tmp_buf to
  // dst if necessary.
  for (i = 0; i < MAX_MB_PLANE; i++) {
    orig_dst[i] = xd->plane[i].dst.buf;
    orig_dst_stride[i] = xd->plane[i].dst.stride;
  }

  /* We don't include the cost of the second reference here, because there
   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
   * words if you present them in that order, the second one is always known
   * if the first is known */
  *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);

  if (!(*mode_excluded))
    *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE
                                  : cm->reference_mode == COMPOUND_REFERENCE;

  pred_exists = 0;
  // Are all MVs integer pel for Y and UV
  intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
  if (is_comp_pred)
    intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);

  // Search for best switchable filter by checking the variance of
  // pred error irrespective of whether the filter will be used
  cpi->mask_filter_rd = 0;
  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
    cpi->rd_filter_cache[i] = INT64_MAX;

  if (cm->interp_filter != BILINEAR) {
    *best_filter = EIGHTTAP;
    if (x->source_variance <
        cpi->sf.disable_filter_search_var_thresh) {
      // Low-variance source: skip the filter search entirely.
      *best_filter = EIGHTTAP;
    } else {
      int newbest;
      int tmp_rate_sum = 0;
      int64_t tmp_dist_sum = 0;

      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
        int j;
        int64_t rs_rd;
        mbmi->interp_filter = i;
        rs = vp9_get_switchable_rate(x);
        rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);

        if (i > 0 && intpel_mv) {
          // Integer-pel MVs: prediction is filter-independent, so reuse
          // the rate/distortion measured for filter 0.
          rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
          cpi->rd_filter_cache[i] = rd;
          cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
              MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
          if (cm->interp_filter == SWITCHABLE)
            rd += rs_rd;
          cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd);
        } else {
          int rate_sum = 0;
          int64_t dist_sum = 0;
          if ((cm->interp_filter == SWITCHABLE &&
               (!i || best_needs_copy)) ||
              (cm->interp_filter != SWITCHABLE &&
               (cm->interp_filter == mbmi->interp_filter ||
                (i == 0 && intpel_mv)))) {
            restore_dst_buf(xd, orig_dst, orig_dst_stride);
          } else {
            // Predict into the scratch buffers to preserve the current best.
            for (j = 0; j < MAX_MB_PLANE; j++) {
              xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
              xd->plane[j].dst.stride = 64;
            }
          }
          vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);

          rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
          cpi->rd_filter_cache[i] = rd;
          cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
              MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
          if (cm->interp_filter == SWITCHABLE)
            rd += rs_rd;
          cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd);

          if (i == 0 && intpel_mv) {
            tmp_rate_sum = rate_sum;
            tmp_dist_sum = dist_sum;
          }
        }

        // Early breakout if even the first filter's modeled rd is hopeless.
        if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
          if (rd / 2 > ref_best_rd) {
            restore_dst_buf(xd, orig_dst, orig_dst_stride);
            return INT64_MAX;
          }
        }
        newbest = i == 0 || rd < best_rd;

        if (newbest) {
          best_rd = rd;
          *best_filter = mbmi->interp_filter;
          if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
            best_needs_copy = !best_needs_copy;
        }

        if ((cm->interp_filter == SWITCHABLE && newbest) ||
            (cm->interp_filter != SWITCHABLE &&
             cm->interp_filter == mbmi->interp_filter)) {
          pred_exists = 1;
        }
      }
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
    }
  }
  // Set the appropriate filter
  mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
      cm->interp_filter : *best_filter;
  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(x) : 0;

  if (pred_exists) {
    if (best_needs_copy) {
      // again temporarily set the buffers to local memory to prevent a memcpy
      for (i = 0; i < MAX_MB_PLANE; i++) {
        xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
        xd->plane[i].dst.stride = 64;
      }
    }
  } else {
    // Handles the special case when a filter that is not in the
    // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
  }

  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
    int tmp_rate;
    int64_t tmp_dist;
    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
    // if current pred_error modeled rd is substantially more than the best
    // so far, do not bother doing full rd
    if (rd / 2 > ref_best_rd) {
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
      return INT64_MAX;
    }
  }

  if (cm->interp_filter == SWITCHABLE)
    *rate2 += vp9_get_switchable_rate(x);

  // Encode-breakout: for single-reference blocks, skip residual coding when
  // the prediction error is below quantization-derived thresholds.
  if (!is_comp_pred) {
    if (!x->in_active_map) {
      // Block is outside the active map: force skip with zero distortion.
      if (psse)
        *psse = 0;
      *distortion = 0;
      x->skip = 1;
    } else if (cpi->allow_encode_breakout && x->encode_breakout) {
      const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
      const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
      unsigned int var, sse;
      // Skipping threshold for ac.
      unsigned int thresh_ac;
      // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
      // Use extreme low threshold for static frames to limit skipping.
      const unsigned int max_thresh = (cpi->allow_encode_breakout ==
                                      ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
      // The encode_breakout input
      const unsigned int min_thresh =
          MIN(((unsigned int)x->encode_breakout << 4), max_thresh);

      // Calculate threshold according to dequant value.
      thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
      thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);

      var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
                                   xd->plane[0].dst.buf,
                                   xd->plane[0].dst.stride, &sse);

      // Adjust threshold according to partition size.
      thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
          b_height_log2_lookup[bsize]);

      // Y skipping condition checking
      if (sse < thresh_ac || sse == 0) {
        // Skipping threshold for dc
        unsigned int thresh_dc;

        thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);

        // dc skipping checking
        if ((sse - var) < thresh_dc || sse == var) {
          unsigned int sse_u, sse_v;
          unsigned int var_u, var_v;

          var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
                                          x->plane[1].src.stride,
                                          xd->plane[1].dst.buf,
                                          xd->plane[1].dst.stride, &sse_u);

          // U skipping condition checking
          if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
              (sse_u - var_u < thresh_dc || sse_u == var_u)) {
            var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
                                            x->plane[2].src.stride,
                                            xd->plane[2].dst.buf,
                                            xd->plane[2].dst.stride, &sse_v);

            // V skipping condition checking
            if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
                (sse_v - var_v < thresh_dc || sse_v == var_v)) {
              x->skip = 1;

              // The cost of skip bit needs to be added.
              *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);

              // Scaling factor for SSE from spatial domain to frequency domain
              // is 16. Adjust distortion accordingly.
              *distortion_uv = (sse_u + sse_v) << 4;
              *distortion = (sse << 4) + *distortion_uv;

              *disable_skip = 1;
              this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
            }
          }
        }
      }
    }
  }

  // Full rate-distortion measurement of the residual (unless skipped above).
  if (!x->skip) {
    int skippable_y, skippable_uv;
    int64_t sseuv = INT64_MAX;
    int64_t rdcosty = INT64_MAX;

    // Y cost and distortion
    inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
                          bsize, txfm_cache, ref_best_rd);

    if (*rate_y == INT_MAX) {
      *rate2 = INT_MAX;
      *distortion = INT64_MAX;
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
      return INT64_MAX;
    }

    *rate2 += *rate_y;
    *distortion += *distortion_y;

    // Remaining rd budget for the chroma planes.
    rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
    rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));

    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
                     bsize, ref_best_rd - rdcosty);
    if (*rate_uv == INT_MAX) {
      *rate2 = INT_MAX;
      *distortion = INT64_MAX;
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
      return INT64_MAX;
    }

    *psse += sseuv;
    *rate2 += *rate_uv;
    *distortion += *distortion_uv;
    *skippable = skippable_y && skippable_uv;
  }

  restore_dst_buf(xd, orig_dst, orig_dst_stride);
  return this_rd;  // if 0, this will be re-calculated by caller
}
3039
3040static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
3041                           int max_plane) {
3042  struct macroblock_plane *const p = x->plane;
3043  struct macroblockd_plane *const pd = x->e_mbd.plane;
3044  int i;
3045
3046  for (i = 0; i < max_plane; ++i) {
3047    p[i].coeff    = ctx->coeff_pbuf[i][1];
3048    p[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
3049    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
3050    p[i].eobs    = ctx->eobs_pbuf[i][1];
3051
3052    ctx->coeff_pbuf[i][1]   = ctx->coeff_pbuf[i][0];
3053    ctx->qcoeff_pbuf[i][1]  = ctx->qcoeff_pbuf[i][0];
3054    ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
3055    ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
3056
3057    ctx->coeff_pbuf[i][0]   = p[i].coeff;
3058    ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
3059    ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
3060    ctx->eobs_pbuf[i][0]    = p[i].eobs;
3061  }
3062}
3063
3064void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
3065                               int *returnrate, int64_t *returndist,
3066                               BLOCK_SIZE bsize,
3067                               PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
3068  VP9_COMMON *const cm = &cpi->common;
3069  MACROBLOCKD *const xd = &x->e_mbd;
3070  int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
3071  int y_skip = 0, uv_skip = 0;
3072  int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
3073  TX_SIZE max_uv_tx_size;
3074  x->skip_encode = 0;
3075  ctx->skip = 0;
3076  xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
3077
3078  if (bsize >= BLOCK_8X8) {
3079    if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
3080                               &dist_y, &y_skip, bsize, tx_cache,
3081                               best_rd) >= best_rd) {
3082      *returnrate = INT_MAX;
3083      return;
3084    }
3085    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
3086    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
3087                            &dist_uv, &uv_skip, bsize, max_uv_tx_size);
3088  } else {
3089    y_skip = 0;
3090    if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
3091                                     &dist_y, best_rd) >= best_rd) {
3092      *returnrate = INT_MAX;
3093      return;
3094    }
3095    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
3096    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
3097                            &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
3098  }
3099
3100  if (y_skip && uv_skip) {
3101    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
3102                  vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3103    *returndist = dist_y + dist_uv;
3104    vp9_zero(ctx->tx_rd_diff);
3105  } else {
3106    int i;
3107    *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3108    *returndist = dist_y + dist_uv;
3109    if (cpi->sf.tx_size_search_method == USE_FULL_RD)
3110      for (i = 0; i < TX_MODES; i++) {
3111        if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
3112          ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
3113        else
3114          ctx->tx_rd_diff[i] = 0;
3115      }
3116  }
3117
3118  ctx->mic = *xd->mi[0];
3119}
3120
// Full rate-distortion mode search for an inter-frame block of size >= 8x8.
// Iterates over vp9_mode_order (intra modes plus single- and compound-
// reference inter modes), pruning candidates with mode_skip_mask and the
// adaptive per-mode RD thresholds, and leaves the winning mode in
// xd->mi[0]->mbmi with its coding context stored in ctx.
// Returns the best RD cost found, or INT64_MAX when no candidate improved
// on best_rd_so_far.
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                  const TileInfo *const tile,
                                  int mi_row, int mi_col,
                                  int *returnrate,
                                  int64_t *returndistortion,
                                  BLOCK_SIZE bsize,
                                  PICK_MODE_CONTEXT *ctx,
                                  int64_t best_rd_so_far) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  const struct segmentation *const seg = &cm->seg;
  MB_PREDICTION_MODE this_mode;
  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
  unsigned char segment_id = mbmi->segment_id;
  int comp_pred, i;
  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
  int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                    VP9_ALT_FLAG };
  int64_t best_rd = best_rd_so_far;
  int64_t best_tx_rd[TX_MODES];
  int64_t best_tx_diff[TX_MODES];
  int64_t best_pred_diff[REFERENCE_MODES];
  int64_t best_pred_rd[REFERENCE_MODES];
  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
  MB_MODE_INFO best_mbmode = { 0 };
  int mode_index, best_mode_index = 0;
  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
  vp9_prob comp_mode_p;
  int64_t best_intra_rd = INT64_MAX;
  int64_t best_inter_rd = INT64_MAX;
  MB_PREDICTION_MODE best_intra_mode = DC_PRED;
  MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
  INTERP_FILTER tmp_best_filter = SWITCHABLE;
  int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
  int64_t dist_uv[TX_SIZES];
  int skip_uv[TX_SIZES];
  MB_PREDICTION_MODE mode_uv[TX_SIZES];
  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
  const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
  const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
  int best_skip2 = 0;
  int mode_skip_mask = 0;  // bit i set => skip vp9_mode_order[i]
  int mode_skip_start = cpi->sf.mode_skip_start + 1;
  const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize];
  const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize];
  const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
  const int intra_y_mode_mask =
      cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
  int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];

  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;

  // Cost of signalling each reference frame choice for this segment.
  estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
                           &comp_mode_p);

  // Initialize all best-cost trackers to "nothing found yet".
  for (i = 0; i < REFERENCE_MODES; ++i)
    best_pred_rd[i] = INT64_MAX;
  for (i = 0; i < TX_MODES; i++)
    best_tx_rd[i] = INT64_MAX;
  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
    best_filter_rd[i] = INT64_MAX;
  for (i = 0; i < TX_SIZES; i++)
    rate_uv_intra[i] = INT_MAX;
  for (i = 0; i < MAX_REF_FRAMES; ++i)
    x->pred_sse[i] = INT_MAX;

  *returnrate = INT_MAX;

  // Set up prediction buffers and candidate NEAREST/NEAR MVs for every
  // available reference frame.
  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
    x->pred_mv_sad[ref_frame] = INT_MAX;
    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
      vp9_setup_buffer_inter(cpi, x, tile,
                             ref_frame, bsize, mi_row, mi_col,
                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
    }
    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
    frame_mv[ZEROMV][ref_frame].as_int = 0;
  }

  // Build the initial mode skip mask from reference availability,
  // reference masking, and segment-level reference restrictions.
  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
    // All modes from vp9_mode_order that use this frame as any ref
    static const int ref_frame_mask_all[] = {
        0x0, 0x123291, 0x25c444, 0x39b722
    };
    // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
    // this frame as their primary ref
    static const int ref_frame_mask_fixedmv[] = {
        0x0, 0x121281, 0x24c404, 0x080102
    };
    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
      // Skip modes for missing references
      mode_skip_mask |= ref_frame_mask_all[ref_frame];
    } else if (cpi->sf.reference_masking) {
      for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
        // Skip fixed mv modes for poor references
        if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
          break;
        }
      }
    }
    // If the segment reference frame feature is enabled....
    // then do nothing if the current ref frame is not allowed..
    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
      mode_skip_mask |= ref_frame_mask_all[ref_frame];
    }
  }

  // If the segment skip feature is enabled....
  // then do nothing if the current mode is not allowed..
  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
    const int inter_non_zero_mode_mask = 0x1F7F7;
    mode_skip_mask |= inter_non_zero_mode_mask;
  }

  // Disable this drop out case if the ref frame
  // segment level feature is enabled for this segment. This is to
  // prevent the possibility that we end up unable to pick any mode.
  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
    // unless ARNR filtering is enabled in which case we want
    // an unfiltered alternative. We allow near/nearest as well
    // because they may result in zero-zero MVs but be cheaper.
    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
      const int altref_zero_mask =
          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
      mode_skip_mask |= altref_zero_mask;
      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
        mode_skip_mask |= (1 << THR_NEARA);
      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
        mode_skip_mask |= (1 << THR_NEARESTA);
    }
  }

  // TODO(JBB): This is to make up for the fact that we don't have sad
  // functions that work when the block size reads outside the umv.  We
  // should fix this either by making the motion search just work on
  // a representative block in the boundary ( first ) and then implement a
  // function that does sads when inside the border..
  if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
    const int new_modes_mask =
        (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
        (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
    mode_skip_mask |= new_modes_mask;
  }

  if (bsize > cpi->sf.max_intra_bsize) {
    // Skip all intra modes when the block is larger than the speed-feature
    // limit for intra search.
    mode_skip_mask |= 0xFF30808;
  }

  if (!x->in_active_map) {
    // Outside the active map: force a single cheap zero-motion LAST_FRAME
    // mode and disable all further pruning of it.
    int mode_index;
    assert(cpi->ref_frame_flags & VP9_LAST_FLAG);
    if (frame_mv[NEARESTMV][LAST_FRAME].as_int == 0)
      mode_index = THR_NEARESTMV;
    else if (frame_mv[NEARMV][LAST_FRAME].as_int == 0)
      mode_index = THR_NEARMV;
    else
      mode_index = THR_ZEROMV;
    mode_skip_mask = ~(1 << mode_index);
    mode_skip_start = MAX_MODES;
    disable_inter_mode_mask = 0;
  }

  // Main search loop over the fixed mode ordering.
  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
    int mode_excluded = 0;
    int64_t this_rd = INT64_MAX;
    int disable_skip = 0;
    int compmode_cost = 0;
    int rate2 = 0, rate_y = 0, rate_uv = 0;
    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
    int skippable = 0;
    int64_t tx_cache[TX_MODES];
    int i;
    int this_skip2 = 0;
    int64_t total_sse = INT64_MAX;
    int early_term = 0;

    // Look at the reference frame of the best mode so far and set the
    // skip mask to look at a subset of the remaining modes.
    if (mode_index == mode_skip_start) {
      switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
        case INTRA_FRAME:
          break;
        case LAST_FRAME:
          mode_skip_mask |= LAST_FRAME_MODE_MASK;
          break;
        case GOLDEN_FRAME:
          mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
          break;
        case ALTREF_FRAME:
          mode_skip_mask |= ALT_REF_MODE_MASK;
          break;
        case NONE:
        case MAX_REF_FRAMES:
          assert(0 && "Invalid Reference frame");
      }
    }
    if (mode_skip_mask & (1 << mode_index))
      continue;

    // Test best rd so far against threshold for trying this mode.
    if (best_rd < ((int64_t)rd_threshes[mode_index] *
                  rd_thresh_freq_fact[mode_index] >> 5) ||
        rd_threshes[mode_index] == INT_MAX)
     continue;

    this_mode = vp9_mode_order[mode_index].mode;
    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
    if (ref_frame != INTRA_FRAME &&
        disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
      continue;
    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];

    comp_pred = second_ref_frame > INTRA_FRAME;
    if (comp_pred) {
      // Compound prediction pruning based on speed-feature flags.
      if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
          vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
        continue;
      if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
          ref_frame != best_inter_ref_frame &&
          second_ref_frame != best_inter_ref_frame)
        continue;
      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
    } else {
      if (ref_frame != INTRA_FRAME)
        mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
    }

    if (ref_frame == INTRA_FRAME) {
      if (!(intra_y_mode_mask & (1 << this_mode)))
        continue;
      if (this_mode != DC_PRED) {
        // Disable intra modes other than DC_PRED for blocks with low variance
        // Threshold for intra skipping based on source variance
        // TODO(debargha): Specialize the threshold for super block sizes
        const unsigned int skip_intra_var_thresh = 64;
        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
            x->source_variance < skip_intra_var_thresh)
          continue;
        // Only search the oblique modes if the best so far is
        // one of the neighboring directional modes
        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
            (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
          if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
            continue;
        }
        if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
          if (conditional_skipintra(this_mode, best_intra_mode))
              continue;
        }
      }
    } else {
      // Inter mode: drop candidates whose zero-MV alternative is
      // provably at least as good.
      if (x->in_active_map &&
          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
                                disable_inter_mode_mask, this_mode, ref_frame,
                                second_ref_frame))
          continue;
    }

    // Install the candidate mode into the macroblock mode info.
    mbmi->mode = this_mode;
    mbmi->uv_mode = x->in_active_map ? DC_PRED : this_mode;
    mbmi->ref_frame[0] = ref_frame;
    mbmi->ref_frame[1] = second_ref_frame;
    // Evaluate all sub-pel filters irrespective of whether we can use
    // them for this frame.
    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
                                                          : cm->interp_filter;
    x->skip = 0;
    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);

    // Select prediction reference frames.
    for (i = 0; i < MAX_MB_PLANE; i++) {
      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
      if (comp_pred)
        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
    }

    for (i = 0; i < TX_MODES; ++i)
      tx_cache[i] = INT64_MAX;

#ifdef MODE_TEST_HIT_STATS
    // TEST/DEBUG CODE
    // Keep a record of the number of test hits at each size
    cpi->mode_test_hits[bsize]++;
#endif

    if (ref_frame == INTRA_FRAME) {
      // Intra candidate: luma RD search, then cached chroma search
      // (one chroma search per uv transform size).
      TX_SIZE uv_tx;
      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
                            bsize, tx_cache, best_rd);

      if (rate_y == INT_MAX)
        continue;

      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize);
      if (rate_uv_intra[uv_tx] == INT_MAX) {
        choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
                             &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
                             &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
      }

      rate_uv = rate_uv_tokenonly[uv_tx];
      distortion_uv = dist_uv[uv_tx];
      skippable = skippable && skip_uv[uv_tx];
      mbmi->uv_mode = mode_uv[uv_tx];

      rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
      if (this_mode != DC_PRED && this_mode != TM_PRED)
        rate2 += intra_cost_penalty;
      distortion2 = distortion_y + distortion_uv;
    } else {
      // Inter candidate: motion search, filter selection, and RD cost.
      this_rd = handle_inter_mode(cpi, x, tile, bsize,
                                  tx_cache,
                                  &rate2, &distortion2, &skippable,
                                  &rate_y, &distortion_y,
                                  &rate_uv, &distortion_uv,
                                  &mode_excluded, &disable_skip,
                                  &tmp_best_filter, frame_mv,
                                  mi_row, mi_col,
                                  single_newmv, &total_sse, best_rd);
      if (this_rd == INT64_MAX)
        continue;

      compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);

      if (cm->reference_mode == REFERENCE_MODE_SELECT)
        rate2 += compmode_cost;
    }

    // Estimate the reference frame signaling cost and add it
    // to the rolling cost variable.
    if (comp_pred) {
      rate2 += ref_costs_comp[ref_frame];
    } else {
      rate2 += ref_costs_single[ref_frame];
    }

    if (!disable_skip) {
      // Test for the condition where skip block will be activated
      // because there are no non zero coefficients and make any
      // necessary adjustment for rate. Ignore if skip is coded at
      // segment level as the cost wont have been added in.
      // Is Mb level skip allowed (i.e. not coded at segment level).
      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
                                                         SEG_LVL_SKIP);

      if (skippable) {
        // Back out the coefficient coding costs
        rate2 -= (rate_y + rate_uv);
        // for best yrd calculation
        rate_uv = 0;

        if (mb_skip_allowed) {
          int prob_skip_cost;

          // Cost the skip mb case
          vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
          if (skip_prob) {
            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
            rate2 += prob_skip_cost;
          }
        }
      } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
          // Add in the cost of the no skip flag.
          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
        } else {
          // FIXME(rbultje) make this work for splitmv also
          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
          distortion2 = total_sse;
          assert(total_sse >= 0);
          rate2 -= (rate_y + rate_uv);
          rate_y = 0;
          rate_uv = 0;
          this_skip2 = 1;
        }
      } else if (mb_skip_allowed) {
        // Add in the cost of the no skip flag.
        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
      }

      // Calculate the final RD estimate for this mode.
      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
    }

    if (ref_frame == INTRA_FRAME) {
      // Keep record of best intra rd
      if (this_rd < best_intra_rd) {
        best_intra_rd = this_rd;
        best_intra_mode = mbmi->mode;
      }
    } else {
      // Keep record of best inter rd with single reference
      if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) {
        best_inter_rd = this_rd;
        best_inter_ref_frame = ref_frame;
      }
    }

    if (!disable_skip && ref_frame == INTRA_FRAME) {
      // An intra winner bounds every reference-mode and filter tracker.
      for (i = 0; i < REFERENCE_MODES; ++i)
        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
        best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
    }

    // Store the respective mode distortions for later use.
    if (mode_distortions[this_mode] == -1
        || distortion2 < mode_distortions[this_mode]) {
      mode_distortions[this_mode] = distortion2;
    }

    // Did this mode help.. i.e. is it the new best mode
    if (this_rd < best_rd || x->skip) {
      int max_plane = MAX_MB_PLANE;
      if (!mode_excluded) {
        // Note index of best mode so far
        best_mode_index = mode_index;

        if (ref_frame == INTRA_FRAME) {
          /* required for left and above block mv */
          mbmi->mv[0].as_int = 0;
          max_plane = 1;
        }

        *returnrate = rate2;
        *returndistortion = distortion2;
        best_rd = this_rd;
        best_mbmode = *mbmi;
        best_skip2 = this_skip2;
        if (!x->select_txfm_size)
          swap_block_ptr(x, ctx, max_plane);
        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                   sizeof(uint8_t) * ctx->num_4x4_blk);

        // TODO(debargha): enhance this test with a better distortion prediction
        // based on qp, activity mask and history
        if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
            (mode_index > MIN_EARLY_TERM_INDEX)) {
          const int qstep = xd->plane[0].dequant[1];
          // TODO(debargha): Enhance this by specializing for each mode_index
          int scale = 4;
          if (x->source_variance < UINT_MAX) {
            const int var_adjust = (x->source_variance < 16);
            scale -= var_adjust;
          }
          if (ref_frame > INTRA_FRAME &&
              distortion2 * scale < qstep * qstep) {
            early_term = 1;
          }
        }
      }
    }

    /* keep record of best compound/single-only prediction */
    if (!disable_skip && ref_frame != INTRA_FRAME) {
      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;

      // single_rate excludes the compound-mode signalling bit,
      // hybrid_rate includes it.
      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
        single_rate = rate2 - compmode_cost;
        hybrid_rate = rate2;
      } else {
        single_rate = rate2;
        hybrid_rate = rate2 + compmode_cost;
      }

      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);

      if (!comp_pred) {
        if (single_rd < best_pred_rd[SINGLE_REFERENCE]) {
          best_pred_rd[SINGLE_REFERENCE] = single_rd;
        }
      } else {
        if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
          best_pred_rd[COMPOUND_REFERENCE] = single_rd;
        }
      }
      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;

      /* keep record of best filter type */
      if (!mode_excluded && cm->interp_filter != BILINEAR) {
        int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
                              SWITCHABLE_FILTERS : cm->interp_filter];

        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
          int64_t adj_rd;
          if (ref == INT64_MAX)
            adj_rd = 0;
          else if (cpi->rd_filter_cache[i] == INT64_MAX)
            // when early termination is triggered, the encoder does not have
            // access to the rate-distortion cost. it only knows that the cost
            // should be above the maximum valid value. hence it takes the known
            // maximum plus an arbitrary constant as the rate-distortion cost.
            adj_rd = cpi->mask_filter_rd - ref + 10;
          else
            adj_rd = cpi->rd_filter_cache[i] - ref;

          adj_rd += this_rd;
          best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
        }
      }
    }

    /* keep record of best txfm size */
    if (bsize < BLOCK_32X32) {
      if (bsize < BLOCK_16X16)
        tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];

      tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
    }
    if (!mode_excluded && this_rd != INT64_MAX) {
      for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
        int64_t adj_rd = INT64_MAX;
        adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];

        if (adj_rd < best_tx_rd[i])
          best_tx_rd[i] = adj_rd;
      }
    }

    if (early_term)
      break;

    if (x->skip && !comp_pred)
      break;
  }

  // Nothing improved on the incoming threshold: report failure.
  if (best_rd >= best_rd_so_far)
    return INT64_MAX;

  // If we used an estimate for the uv intra rd in the loop above...
  if (cpi->sf.use_uv_intra_rd_estimate) {
    // Do Intra UV best rd mode selection if best mode choice above was intra.
    if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
      TX_SIZE uv_tx_size;
      *mbmi = best_mbmode;
      uv_tx_size = get_uv_tx_size(mbmi);
      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                              &rate_uv_tokenonly[uv_tx_size],
                              &dist_uv[uv_tx_size],
                              &skip_uv[uv_tx_size],
                              bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
                              uv_tx_size);
    }
  }

  assert((cm->interp_filter == SWITCHABLE) ||
         (cm->interp_filter == best_mbmode.interp_filter) ||
         !is_inter_block(&best_mbmode));

  // Updating rd_thresh_freq_fact[] here means that the different
  // partition/block sizes are handled independently based on the best
  // choice for the current partition. It may well be better to keep a scaled
  // best rd so far value and update rd_thresh_freq_fact based on the mode/size
  // combination that wins out.
  if (cpi->sf.adaptive_rd_thresh) {
    for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
      int *const fact = &cpi->rd_thresh_freq_fact[bsize][mode_index];

      if (mode_index == best_mode_index) {
        *fact -= (*fact >> 3);
      } else {
        *fact = MIN(*fact + RD_THRESH_INC,
                    cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
      }
    }
  }

  // macroblock modes
  *mbmi = best_mbmode;
  x->skip |= best_skip2;

  // Convert the tracked best RD values into differences relative to the
  // overall winner, for use by the two-pass context.
  for (i = 0; i < REFERENCE_MODES; ++i) {
    if (best_pred_rd[i] == INT64_MAX)
      best_pred_diff[i] = INT_MIN;
    else
      best_pred_diff[i] = best_rd - best_pred_rd[i];
  }

  if (!x->skip) {
    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
      if (best_filter_rd[i] == INT64_MAX)
        best_filter_diff[i] = 0;
      else
        best_filter_diff[i] = best_rd - best_filter_rd[i];
    }
    if (cm->interp_filter == SWITCHABLE)
      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
    for (i = 0; i < TX_MODES; i++) {
      if (best_tx_rd[i] == INT64_MAX)
        best_tx_diff[i] = 0;
      else
        best_tx_diff[i] = best_rd - best_tx_rd[i];
    }
  } else {
    vp9_zero(best_filter_diff);
    vp9_zero(best_tx_diff);
  }

  if (!x->in_active_map) {
    // Sanity-check the forced inactive-map mode selected earlier.
    assert(mbmi->ref_frame[0] == LAST_FRAME);
    assert(mbmi->ref_frame[1] == NONE);
    assert(mbmi->mode == NEARESTMV ||
           mbmi->mode == NEARMV ||
           mbmi->mode == ZEROMV);
    assert(frame_mv[mbmi->mode][LAST_FRAME].as_int == 0);
    assert(mbmi->mode == mbmi->uv_mode);
  }

  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
  store_coding_context(x, ctx, best_mode_index,
                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                       &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
                                      mbmi->ref_frame[1]][0],
                       best_pred_diff, best_tx_diff, best_filter_diff);

  return best_rd;
}
3750
3751
3752int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
3753                                      const TileInfo *const tile,
3754                                      int mi_row, int mi_col,
3755                                      int *returnrate,
3756                                      int64_t *returndistortion,
3757                                      BLOCK_SIZE bsize,
3758                                      PICK_MODE_CONTEXT *ctx,
3759                                      int64_t best_rd_so_far) {
3760  VP9_COMMON *cm = &cpi->common;
3761  MACROBLOCKD *xd = &x->e_mbd;
3762  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
3763  const struct segmentation *seg = &cm->seg;
3764  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3765  unsigned char segment_id = mbmi->segment_id;
3766  int comp_pred, i;
3767  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3768  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3769  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3770                                    VP9_ALT_FLAG };
3771  int64_t best_rd = best_rd_so_far;
3772  int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
3773  int64_t best_tx_rd[TX_MODES];
3774  int64_t best_tx_diff[TX_MODES];
3775  int64_t best_pred_diff[REFERENCE_MODES];
3776  int64_t best_pred_rd[REFERENCE_MODES];
3777  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3778  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3779  MB_MODE_INFO best_mbmode = { 0 };
3780  int mode_index, best_mode_index = 0;
3781  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3782  vp9_prob comp_mode_p;
3783  int64_t best_inter_rd = INT64_MAX;
3784  MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
3785  INTERP_FILTER tmp_best_filter = SWITCHABLE;
3786  int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
3787  int64_t dist_uv[TX_SIZES];
  int skip_uv[TX_SIZES];
  MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
  // Rate penalty applied to intra candidates, scaled with the DC quantizer
  // so intra is disfavored more strongly at higher quality (lower q) levels.
  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
  int_mv seg_mvs[4][MAX_REF_FRAMES];
  b_mode_info best_bmodes[4];
  int best_skip2 = 0;
  int ref_frame_mask = 0;
  int mode_skip_mask = 0;

  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
  vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);

  // Invalidate all per-segment motion vectors before the search.
  for (i = 0; i < 4; i++) {
    int j;
    for (j = 0; j < MAX_REF_FRAMES; j++)
      seg_mvs[i][j].as_int = INVALID_MV;
  }

  estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
                           &comp_mode_p);

  // Reset best-so-far rd trackers for reference modes, transform modes and
  // interpolation filters; UV intra rates start at INT_MAX so they are
  // computed lazily on first use below.
  for (i = 0; i < REFERENCE_MODES; ++i)
    best_pred_rd[i] = INT64_MAX;
  for (i = 0; i < TX_MODES; i++)
    best_tx_rd[i] = INT64_MAX;
  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
    best_filter_rd[i] = INT64_MAX;
  for (i = 0; i < TX_SIZES; i++)
    rate_uv_intra[i] = INT_MAX;

  *returnrate = INT_MAX;

  // Set up prediction buffers and NEAREST/NEAR candidate motion vectors for
  // every reference frame enabled for this frame; NEWMV starts invalid and
  // ZEROMV is, by definition, the zero vector.
  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
      vp9_setup_buffer_inter(cpi, x, tile,
                             ref_frame, bsize, mi_row, mi_col,
                             frame_mv[NEARESTMV], frame_mv[NEARMV],
                             yv12_mb);
    }
    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
    frame_mv[ZEROMV][ref_frame].as_int = 0;
  }

  // When reference masking is enabled, flag reference frames whose predicted
  // MV SAD is at least twice that of some other reference; such frames are
  // considered poor candidates.  NOTE(review): ref_frame_mask is computed
  // here but not read in the remainder of this chunk — confirm it is used
  // (or intentionally unused) elsewhere.
  for (ref_frame = LAST_FRAME;
       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
    int i;
    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
        ref_frame_mask |= (1 << ref_frame);
        break;
      }
    }
  }

  // Main loop over the sub-8x8 reference candidates (vp9_ref_order).
  for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
    int mode_excluded = 0;
    int64_t this_rd = INT64_MAX;
    int disable_skip = 0;
    int compmode_cost = 0;
    int rate2 = 0, rate_y = 0, rate_uv = 0;
    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
    int skippable = 0;
    int64_t tx_cache[TX_MODES];
    int i;
    int this_skip2 = 0;
    // NOTE(review): initialized with INT_MAX although the variable is
    // int64_t — INT64_MAX looks intended; confirm against upstream before
    // changing, since total_sse is overwritten by the search paths below.
    int64_t total_sse = INT_MAX;
    int early_term = 0;

    for (i = 0; i < TX_MODES; ++i)
      tx_cache[i] = INT64_MAX;

    x->skip = 0;
    ref_frame = vp9_ref_order[mode_index].ref_frame[0];
    second_ref_frame = vp9_ref_order[mode_index].ref_frame[1];

    // Look at the reference frame of the best mode so far and set the
    // skip mask to look at a subset of the remaining modes.
    // Bit i of mode_skip_mask excludes vp9_ref_order index i.
    if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
      if (mode_index == 3) {
        switch (vp9_ref_order[best_mode_index].ref_frame[0]) {
          case INTRA_FRAME:
            mode_skip_mask = 0;
            break;
          case LAST_FRAME:
            mode_skip_mask = 0x0010;
            break;
          case GOLDEN_FRAME:
            mode_skip_mask = 0x0008;
            break;
          case ALTREF_FRAME:
            mode_skip_mask = 0x0000;
            break;
          case NONE:
          case MAX_REF_FRAMES:
            assert(0 && "Invalid Reference frame");
        }
      }
      if (mode_skip_mask & (1 << mode_index))
        continue;
    }

    // Test best rd so far against threshold for trying this mode.
    if ((best_rd <
         ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
          cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
        cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX)
      continue;

    // Do not allow compound prediction if the segment level reference
    // frame feature is in use as in this case there can only be one reference.
    if ((second_ref_frame > INTRA_FRAME) &&
         vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
      continue;

    mbmi->ref_frame[0] = ref_frame;
    mbmi->ref_frame[1] = second_ref_frame;

    // Skip candidates whose reference frame(s) are not enabled this frame.
    if (!(ref_frame == INTRA_FRAME
        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
      continue;
    }
    if (!(second_ref_frame == NONE
        || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
      continue;
    }

    comp_pred = second_ref_frame > INTRA_FRAME;
    if (comp_pred) {
      // Speed-feature pruning of compound candidates: skip if the best mode
      // so far is intra, or if neither reference matches the best inter
      // reference found so far.
      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
        if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
          continue;
      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
        if (ref_frame != best_inter_ref_frame &&
            second_ref_frame != best_inter_ref_frame)
          continue;
    }

    // TODO(jingning, jkoleszar): scaling reference frame not supported for
    // sub8x8 blocks.
    if (ref_frame > 0 && vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
      continue;

    if (second_ref_frame > 0 &&
        vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
      continue;

    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
    mbmi->uv_mode = DC_PRED;

    // Evaluate all sub-pel filters irrespective of whether we can use
    // them for this frame.
    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
                                                          : cm->interp_filter;

    if (comp_pred) {
      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
        continue;

      // A compound candidate is excluded from the best-mode bookkeeping when
      // the frame is restricted to single reference (and vice versa below).
      mode_excluded = mode_excluded ? mode_excluded
                                    : cm->reference_mode == SINGLE_REFERENCE;
    } else {
      if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
        mode_excluded = mode_excluded ?
            mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
      }
    }

    // Select prediction reference frames.
    for (i = 0; i < MAX_MB_PLANE; i++) {
      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
      if (comp_pred)
        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
    }

    // If the segment reference frame feature is enabled....
    // then do nothing if the current ref frame is not allowed..
    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
            (int)ref_frame) {
      continue;
    // If the segment skip feature is enabled....
    // then do nothing if the current mode is not allowed..
    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
               ref_frame != INTRA_FRAME) {
      continue;
    // Disable this drop out case if the ref frame
    // segment level feature is enabled for this segment. This is to
    // prevent the possibility that we end up unable to pick any mode.
    } else if (!vp9_segfeature_active(seg, segment_id,
                                      SEG_LVL_REF_FRAME)) {
      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
      // unless ARNR filtering is enabled in which case we want
      // an unfiltered alternative. We allow near/nearest as well
      // because they may result in zero-zero MVs but be cheaper.
      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
        continue;
    }

#ifdef MODE_TEST_HIT_STATS
    // TEST/DEBUG CODE
    // Keep a record of the number of test hits at each size
    cpi->mode_test_hits[bsize]++;
#endif

    if (ref_frame == INTRA_FRAME) {
      // ---- Intra path: 4x4 luma sub-modes plus a (possibly cached) UV
      // intra mode; all tx_cache entries collapse to the 4x4 cost. ----
      int rate;
      mbmi->tx_size = TX_4X4;
      if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
                                       &distortion_y, best_rd) >= best_rd)
        continue;
      rate2 += rate;
      rate2 += intra_cost_penalty;
      distortion2 += distortion_y;

      // Lazily compute the UV intra rd once per block and reuse it.
      if (rate_uv_intra[TX_4X4] == INT_MAX) {
        choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
                             &rate_uv_intra[TX_4X4],
                             &rate_uv_tokenonly[TX_4X4],
                             &dist_uv[TX_4X4], &skip_uv[TX_4X4],
                             &mode_uv[TX_4X4]);
      }
      rate2 += rate_uv_intra[TX_4X4];
      rate_uv = rate_uv_tokenonly[TX_4X4];
      distortion2 += dist_uv[TX_4X4];
      distortion_uv = dist_uv[TX_4X4];
      mbmi->uv_mode = mode_uv[TX_4X4];
      tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
      for (i = 0; i < TX_MODES; ++i)
        tx_cache[i] = tx_cache[ONLY_4X4];
    } else {
      // ---- Inter path: per-filter segmentation search, then UV rd. ----
      int rate;
      int64_t distortion;
      int64_t this_rd_thresh;
      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
      // NOTE(review): tmp_best_distortion is int64_t but initialized with
      // INT_MAX — INT64_MAX looks intended; confirm before changing.
      int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
      int tmp_best_skippable = 0;
      int switchable_filter_index;
      int_mv *second_ref = comp_pred ?
                             &mbmi->ref_mvs[second_ref_frame][0] : NULL;
      b_mode_info tmp_best_bmodes[16];
      MB_MODE_INFO tmp_best_mbmode;
      BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
      int pred_exists = 0;
      int uv_skippable;

      // Pick the rd threshold matching this candidate's reference frame.
      this_rd_thresh = (ref_frame == LAST_FRAME) ?
          cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] :
          cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
      this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
          cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
      xd->mi[0]->mbmi.tx_size = TX_4X4;

      cpi->mask_filter_rd = 0;
      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
        cpi->rd_filter_cache[i] = INT64_MAX;

      if (cm->interp_filter != BILINEAR) {
        tmp_best_filter = EIGHTTAP;
        if (x->source_variance <
            cpi->sf.disable_filter_search_var_thresh) {
          // Low-variance source: skip the filter search entirely.
          tmp_best_filter = EIGHTTAP;
        } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
                   ctx->pred_interp_filter < SWITCHABLE) {
          // Reuse the filter predicted from context.
          tmp_best_filter = ctx->pred_interp_filter;
        } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
                              ctx->pred_interp_filter : 0;
        } else {
          // Full search over the switchable filter set.
          for (switchable_filter_index = 0;
               switchable_filter_index < SWITCHABLE_FILTERS;
               ++switchable_filter_index) {
            int newbest, rs;
            int64_t rs_rd;
            mbmi->interp_filter = switchable_filter_index;
            tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
                                                 &mbmi->ref_mvs[ref_frame][0],
                                                 second_ref,
                                                 best_yrd,
                                                 &rate, &rate_y, &distortion,
                                                 &skippable, &total_sse,
                                                 (int)this_rd_thresh, seg_mvs,
                                                 bsi, switchable_filter_index,
                                                 mi_row, mi_col);

            if (tmp_rd == INT64_MAX)
              continue;
            rs = vp9_get_switchable_rate(x);
            rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
            cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
            cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
                MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
                    tmp_rd + rs_rd);
            // Only charge the filter signalling rate when the frame actually
            // signals a switchable filter.
            if (cm->interp_filter == SWITCHABLE)
              tmp_rd += rs_rd;

            cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, tmp_rd);

            newbest = (tmp_rd < tmp_best_rd);
            if (newbest) {
              tmp_best_filter = mbmi->interp_filter;
              tmp_best_rd = tmp_rd;
            }
            // Record the full search state either for a new best under
            // SWITCHABLE, or whenever the filter matches the fixed frame
            // filter (so the non-switchable result is always captured).
            if ((newbest && cm->interp_filter == SWITCHABLE) ||
                (mbmi->interp_filter == cm->interp_filter &&
                 cm->interp_filter != SWITCHABLE)) {
              tmp_best_rdu = tmp_rd;
              tmp_best_rate = rate;
              tmp_best_ratey = rate_y;
              tmp_best_distortion = distortion;
              tmp_best_sse = total_sse;
              tmp_best_skippable = skippable;
              tmp_best_mbmode = *mbmi;
              for (i = 0; i < 4; i++) {
                tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
                x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
              }
              pred_exists = 1;
              if (switchable_filter_index == 0 &&
                  cpi->sf.use_rd_breakout &&
                  best_rd < INT64_MAX) {
                if (tmp_best_rdu / 2 > best_rd) {
                  // skip searching the other filters if the first is
                  // already substantially larger than the best so far
                  tmp_best_filter = mbmi->interp_filter;
                  tmp_best_rdu = INT64_MAX;
                  break;
                }
              }
            }
          }  // switchable_filter_index loop
        }
      }

      // The rd-breakout above invalidates tmp_best_rdu while leaving
      // pred_exists set; bail out of this candidate in that case.
      if (tmp_best_rdu == INT64_MAX && pred_exists)
        continue;

      mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
                             tmp_best_filter : cm->interp_filter);
      if (!pred_exists) {
        // Handles the special case when a filter that is not in the
        // switchable list (bilinear, 6-tap) is indicated at the frame level
        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
                     &mbmi->ref_mvs[ref_frame][0],
                     second_ref,
                     best_yrd,
                     &rate, &rate_y, &distortion,
                     &skippable, &total_sse,
                     (int)this_rd_thresh, seg_mvs,
                     bsi, 0,
                     mi_row, mi_col);
        if (tmp_rd == INT64_MAX)
          continue;
      } else {
        // Restore the state saved for the winning filter.
        total_sse = tmp_best_sse;
        rate = tmp_best_rate;
        rate_y = tmp_best_ratey;
        distortion = tmp_best_distortion;
        skippable = tmp_best_skippable;
        *mbmi = tmp_best_mbmode;
        for (i = 0; i < 4; i++)
          xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
      }

      rate2 += rate;
      distortion2 += distortion;

      if (cm->interp_filter == SWITCHABLE)
        rate2 += vp9_get_switchable_rate(x);

      if (!mode_excluded)
        mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
                                  : cm->reference_mode == COMPOUND_REFERENCE;

      compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);

      // Remaining rd budget for the UV search (tmp_best_rdu is reused here
      // with a different meaning than in the filter loop above).
      tmp_best_rdu = best_rd -
          MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
              RDCOST(x->rdmult, x->rddiv, 0, total_sse));

      if (tmp_best_rdu > 0) {
        // If even the 'Y' rd value of split is higher than best so far
        // then don't bother looking at UV
        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                        BLOCK_8X8);
        super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
                         &uv_sse, BLOCK_8X8, tmp_best_rdu);
        if (rate_uv == INT_MAX)
          continue;
        rate2 += rate_uv;
        distortion2 += distortion_uv;
        skippable = skippable && uv_skippable;
        total_sse += uv_sse;

        tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
        for (i = 0; i < TX_MODES; ++i)
          tx_cache[i] = tx_cache[ONLY_4X4];
      }
    }

    if (cm->reference_mode == REFERENCE_MODE_SELECT)
      rate2 += compmode_cost;

    // Estimate the reference frame signaling cost and add it
    // to the rolling cost variable.
    if (second_ref_frame > INTRA_FRAME) {
      rate2 += ref_costs_comp[ref_frame];
    } else {
      rate2 += ref_costs_single[ref_frame];
    }

    if (!disable_skip) {
      // Test for the condition where skip block will be activated
      // because there are no non zero coefficients and make any
      // necessary adjustment for rate. Ignore if skip is coded at
      // segment level as the cost won't have been added in.
      // Is Mb level skip allowed (i.e. not coded at segment level).
      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
                                                         SEG_LVL_SKIP);

      if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
          // Add in the cost of the no skip flag.
          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
        } else {
          // FIXME(rbultje) make this work for splitmv also
          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
          distortion2 = total_sse;
          assert(total_sse >= 0);
          rate2 -= (rate_y + rate_uv);
          rate_y = 0;
          rate_uv = 0;
          this_skip2 = 1;
        }
      } else if (mb_skip_allowed) {
        // Add in the cost of the no skip flag.
        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
      }

      // Calculate the final RD estimate for this mode.
      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
    }

    // Keep record of best inter rd with single reference
    if (is_inter_block(&xd->mi[0]->mbmi) &&
        !has_second_ref(&xd->mi[0]->mbmi) &&
        !mode_excluded &&
        this_rd < best_inter_rd) {
      best_inter_rd = this_rd;
      best_inter_ref_frame = ref_frame;
    }

    // Intra candidates bound every reference-mode and filter tracker, since
    // intra is available regardless of the chosen reference mode / filter.
    if (!disable_skip && ref_frame == INTRA_FRAME) {
      for (i = 0; i < REFERENCE_MODES; ++i)
        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
        best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
    }

    // Did this mode help.. i.e. is it the new best mode
    if (this_rd < best_rd || x->skip) {
      if (!mode_excluded) {
        int max_plane = MAX_MB_PLANE;
        // Note index of best mode so far
        best_mode_index = mode_index;

        if (ref_frame == INTRA_FRAME) {
          /* required for left and above block mv */
          mbmi->mv[0].as_int = 0;
          max_plane = 1;
        }

        *returnrate = rate2;
        *returndistortion = distortion2;
        best_rd = this_rd;
        // Luma-only rd used as the search threshold for later candidates.
        best_yrd = best_rd -
                   RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
        best_mbmode = *mbmi;
        best_skip2 = this_skip2;
        if (!x->select_txfm_size)
          swap_block_ptr(x, ctx, max_plane);
        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                   sizeof(uint8_t) * ctx->num_4x4_blk);

        for (i = 0; i < 4; i++)
          best_bmodes[i] = xd->mi[0]->bmi[i];

        // TODO(debargha): enhance this test with a better distortion prediction
        // based on qp, activity mask and history
        if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
            (mode_index > MIN_EARLY_TERM_INDEX)) {
          const int qstep = xd->plane[0].dequant[1];
          // TODO(debargha): Enhance this by specializing for each mode_index
          int scale = 4;
          if (x->source_variance < UINT_MAX) {
            const int var_adjust = (x->source_variance < 16);
            scale -= var_adjust;
          }
          if (ref_frame > INTRA_FRAME &&
              distortion2 * scale < qstep * qstep) {
            early_term = 1;
          }
        }
      }
    }

    /* keep record of best compound/single-only prediction */
    if (!disable_skip && ref_frame != INTRA_FRAME) {
      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;

      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
        single_rate = rate2 - compmode_cost;
        hybrid_rate = rate2;
      } else {
        single_rate = rate2;
        hybrid_rate = rate2 + compmode_cost;
      }

      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);

      if (second_ref_frame <= INTRA_FRAME &&
          single_rd < best_pred_rd[SINGLE_REFERENCE]) {
        best_pred_rd[SINGLE_REFERENCE] = single_rd;
      } else if (second_ref_frame > INTRA_FRAME &&
                 single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
        best_pred_rd[COMPOUND_REFERENCE] = single_rd;
      }
      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
    }

    /* keep record of best filter type */
    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
        cm->interp_filter != BILINEAR) {
      int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
                              SWITCHABLE_FILTERS : cm->interp_filter];
      int64_t adj_rd;
      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
        if (ref == INT64_MAX)
          adj_rd = 0;
        else if (cpi->rd_filter_cache[i] == INT64_MAX)
          // when early termination is triggered, the encoder does not have
          // access to the rate-distortion cost. it only knows that the cost
          // should be above the maximum valid value. hence it takes the known
          // maximum plus an arbitrary constant as the rate-distortion cost.
          adj_rd = cpi->mask_filter_rd - ref + 10;
        else
          adj_rd = cpi->rd_filter_cache[i] - ref;

        adj_rd += this_rd;
        best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
      }
    }

    /* keep record of best txfm size */
    // Sub-8x8 always codes with 4x4 transforms, so larger-tx entries simply
    // mirror the smaller ones for the block sizes where they apply.
    if (bsize < BLOCK_32X32) {
      if (bsize < BLOCK_16X16) {
        tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
        tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
      }
      tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
    }
    if (!mode_excluded && this_rd != INT64_MAX) {
      for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
        int64_t adj_rd = INT64_MAX;
        if (ref_frame > INTRA_FRAME)
          adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
        else
          adj_rd = this_rd;

        if (adj_rd < best_tx_rd[i])
          best_tx_rd[i] = adj_rd;
      }
    }

    if (early_term)
      break;

    if (x->skip && !comp_pred)
      break;
  }

  if (best_rd >= best_rd_so_far)
    return INT64_MAX;

  // If we used an estimate for the uv intra rd in the loop above...
  if (cpi->sf.use_uv_intra_rd_estimate) {
    // Do Intra UV best rd mode selection if best mode choice above was intra.
    if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
      TX_SIZE uv_tx_size;
      *mbmi = best_mbmode;
      uv_tx_size = get_uv_tx_size(mbmi);
      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                              &rate_uv_tokenonly[uv_tx_size],
                              &dist_uv[uv_tx_size],
                              &skip_uv[uv_tx_size],
                              BLOCK_8X8, uv_tx_size);
    }
  }

  // No candidate survived the search: report failure to the caller.
  if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
    *returnrate = INT_MAX;
    *returndistortion = INT64_MAX;
    return best_rd;
  }

  assert((cm->interp_filter == SWITCHABLE) ||
         (cm->interp_filter == best_mbmode.interp_filter) ||
         !is_inter_block(&best_mbmode));

  // Updating rd_thresh_freq_fact[] here means that the different
  // partition/block sizes are handled independently based on the best
  // choice for the current partition. It may well be better to keep a scaled
  // best rd so far value and update rd_thresh_freq_fact based on the mode/size
  // combination that wins out.
  if (cpi->sf.adaptive_rd_thresh) {
    for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
      int *const fact = &cpi->rd_thresh_freq_sub8x8[bsize][mode_index];

      if (mode_index == best_mode_index) {
        *fact -= (*fact >> 3);
      } else {
        *fact = MIN(*fact + RD_THRESH_INC,
                    cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
      }
    }
  }

  // macroblock modes
  *mbmi = best_mbmode;
  x->skip |= best_skip2;
  if (!is_inter_block(&best_mbmode)) {
    for (i = 0; i < 4; i++)
      xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
  } else {
    for (i = 0; i < 4; ++i)
      vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));

    // The block-level mv of the bottom-right 4x4 sub-block represents the
    // whole 8x8 block for above/left mv reference purposes.
    mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
    mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
  }

  // Convert absolute best rds into diffs vs. the overall best for the
  // context stored below.
  for (i = 0; i < REFERENCE_MODES; ++i) {
    if (best_pred_rd[i] == INT64_MAX)
      best_pred_diff[i] = INT_MIN;
    else
      best_pred_diff[i] = best_rd - best_pred_rd[i];
  }

  if (!x->skip) {
    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
      if (best_filter_rd[i] == INT64_MAX)
        best_filter_diff[i] = 0;
      else
        best_filter_diff[i] = best_rd - best_filter_rd[i];
    }
    if (cm->interp_filter == SWITCHABLE)
      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
  } else {
    vp9_zero(best_filter_diff);
  }

  if (!x->skip) {
    for (i = 0; i < TX_MODES; i++) {
      if (best_tx_rd[i] == INT64_MAX)
        best_tx_diff[i] = 0;
      else
        best_tx_diff[i] = best_rd - best_tx_rd[i];
    }
  } else {
    vp9_zero(best_tx_diff);
  }

  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
  store_coding_context(x, ctx, best_mode_index,
                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                       &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
                                      mbmi->ref_frame[1]][0],
                       best_pred_diff, best_tx_diff, best_filter_diff);

  return best_rd;
}
4472