1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_dsp_rtcd.h"
12
13#include "vpx_config.h"
14#include "vp8_rtcd.h"
15#include "encodemb.h"
16#include "vp8/common/reconinter.h"
17#include "vp8/encoder/quantize.h"
18#include "tokenize.h"
19#include "vp8/common/invtrans.h"
20#include "vpx_mem/vpx_mem.h"
21#include "rdopt.h"
22
23void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
24  unsigned char *src_ptr = (*(be->base_src) + be->src);
25  short *diff_ptr = be->src_diff;
26  unsigned char *pred_ptr = bd->predictor;
27  int src_stride = be->src_stride;
28
29  vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
30                     pred_ptr, pitch);
31}
32
33void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
34                         int src_stride, unsigned char *upred,
35                         unsigned char *vpred, int pred_stride) {
36  short *udiff = diff + 256;
37  short *vdiff = diff + 320;
38
39  vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
40  vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
41}
42
43void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
44                      unsigned char *pred, int pred_stride) {
45  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
46}
47
48static void vp8_subtract_mb(MACROBLOCK *x)
49{
50    BLOCK *b = &x->block[0];
51
52    vp8_subtract_mby(x->src_diff, *(b->base_src),
53        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
54    vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
55        x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
56        x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
57}
58
59static void build_dcblock(MACROBLOCK *x)
60{
61    short *src_diff_ptr = &x->src_diff[384];
62    int i;
63
64    for (i = 0; i < 16; i++)
65    {
66        src_diff_ptr[i] = x->coeff[i * 16];
67    }
68}
69
70void vp8_transform_mbuv(MACROBLOCK *x)
71{
72    int i;
73
74    for (i = 16; i < 24; i += 2)
75    {
76        x->short_fdct8x4(&x->block[i].src_diff[0],
77            &x->block[i].coeff[0], 16);
78    }
79}
80
81
82void vp8_transform_intra_mby(MACROBLOCK *x)
83{
84    int i;
85
86    for (i = 0; i < 16; i += 2)
87    {
88        x->short_fdct8x4(&x->block[i].src_diff[0],
89            &x->block[i].coeff[0], 32);
90    }
91
92    /* build dc block from 16 y dc values */
93    build_dcblock(x);
94
95    /* do 2nd order transform on the dc block */
96    x->short_walsh4x4(&x->block[24].src_diff[0],
97        &x->block[24].coeff[0], 8);
98
99}
100
101
102static void transform_mb(MACROBLOCK *x)
103{
104    int i;
105
106    for (i = 0; i < 16; i += 2)
107    {
108        x->short_fdct8x4(&x->block[i].src_diff[0],
109            &x->block[i].coeff[0], 32);
110    }
111
112    /* build dc block from 16 y dc values */
113    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
114        build_dcblock(x);
115
116    for (i = 16; i < 24; i += 2)
117    {
118        x->short_fdct8x4(&x->block[i].src_diff[0],
119            &x->block[i].coeff[0], 16);
120    }
121
122    /* do 2nd order transform on the dc block */
123    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
124        x->short_walsh4x4(&x->block[24].src_diff[0],
125        &x->block[24].coeff[0], 8);
126
127}
128
129
130static void transform_mby(MACROBLOCK *x)
131{
132    int i;
133
134    for (i = 0; i < 16; i += 2)
135    {
136        x->short_fdct8x4(&x->block[i].src_diff[0],
137            &x->block[i].coeff[0], 32);
138    }
139
140    /* build dc block from 16 y dc values */
141    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
142    {
143        build_dcblock(x);
144        x->short_walsh4x4(&x->block[24].src_diff[0],
145            &x->block[24].coeff[0], 8);
146    }
147}
148
149
150
151#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
152
153typedef struct vp8_token_state vp8_token_state;
154
155struct vp8_token_state{
156  int           rate;
157  int           error;
158  signed char   next;
159  signed char   token;
160  short         qc;
161};
162
163/* TODO: experiments to find optimal multiple numbers */
164#define Y1_RD_MULT 4
165#define UV_RD_MULT 2
166#define Y2_RD_MULT 16
167
168static const int plane_rd_mult[4]=
169{
170    Y1_RD_MULT,
171    Y2_RD_MULT,
172    UV_RD_MULT,
173    Y1_RD_MULT
174};
175
176static void optimize_b(MACROBLOCK *mb, int ib, int type,
177                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
178{
179    BLOCK *b;
180    BLOCKD *d;
181    vp8_token_state tokens[17][2];
182    unsigned best_mask[2];
183    const short *dequant_ptr;
184    const short *coeff_ptr;
185    short *qcoeff_ptr;
186    short *dqcoeff_ptr;
187    int eob;
188    int i0;
189    int rc;
190    int x;
191    int sz = 0;
192    int next;
193    int rdmult;
194    int rddiv;
195    int final_eob;
196    int rd_cost0;
197    int rd_cost1;
198    int rate0;
199    int rate1;
200    int error0;
201    int error1;
202    int t0;
203    int t1;
204    int best;
205    int band;
206    int pt;
207    int i;
208    int err_mult = plane_rd_mult[type];
209
210    b = &mb->block[ib];
211    d = &mb->e_mbd.block[ib];
212
213    dequant_ptr = d->dequant;
214    coeff_ptr = b->coeff;
215    qcoeff_ptr = d->qcoeff;
216    dqcoeff_ptr = d->dqcoeff;
217    i0 = !type;
218    eob = *d->eob;
219
220    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
221    rdmult = mb->rdmult * err_mult;
222    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
223        rdmult = (rdmult * 9)>>4;
224
225    rddiv = mb->rddiv;
226    best_mask[0] = best_mask[1] = 0;
227    /* Initialize the sentinel node of the trellis. */
228    tokens[eob][0].rate = 0;
229    tokens[eob][0].error = 0;
230    tokens[eob][0].next = 16;
231    tokens[eob][0].token = DCT_EOB_TOKEN;
232    tokens[eob][0].qc = 0;
233    *(tokens[eob] + 1) = *(tokens[eob] + 0);
234    next = eob;
235    for (i = eob; i-- > i0;)
236    {
237        int base_bits;
238        int d2;
239        int dx;
240
241        rc = vp8_default_zig_zag1d[i];
242        x = qcoeff_ptr[rc];
243        /* Only add a trellis state for non-zero coefficients. */
244        if (x)
245        {
246            int shortcut=0;
247            error0 = tokens[next][0].error;
248            error1 = tokens[next][1].error;
249            /* Evaluate the first possibility for this state. */
250            rate0 = tokens[next][0].rate;
251            rate1 = tokens[next][1].rate;
252            t0 = (vp8_dct_value_tokens_ptr + x)->Token;
253            /* Consider both possible successor states. */
254            if (next < 16)
255            {
256                band = vp8_coef_bands[i + 1];
257                pt = vp8_prev_token_class[t0];
258                rate0 +=
259                    mb->token_costs[type][band][pt][tokens[next][0].token];
260                rate1 +=
261                    mb->token_costs[type][band][pt][tokens[next][1].token];
262            }
263            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
264            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
265            if (rd_cost0 == rd_cost1)
266            {
267                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
268                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
269            }
270            /* And pick the best. */
271            best = rd_cost1 < rd_cost0;
272            base_bits = *(vp8_dct_value_cost_ptr + x);
273            dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
274            d2 = dx*dx;
275            tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
276            tokens[i][0].error = d2 + (best ? error1 : error0);
277            tokens[i][0].next = next;
278            tokens[i][0].token = t0;
279            tokens[i][0].qc = x;
280            best_mask[0] |= best << i;
281            /* Evaluate the second possibility for this state. */
282            rate0 = tokens[next][0].rate;
283            rate1 = tokens[next][1].rate;
284
285            if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
286               (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
287                shortcut = 1;
288            else
289                shortcut = 0;
290
291            if(shortcut)
292            {
293                sz = -(x < 0);
294                x -= 2*sz + 1;
295            }
296
297            /* Consider both possible successor states. */
298            if (!x)
299            {
300                /* If we reduced this coefficient to zero, check to see if
301                 *  we need to move the EOB back here.
302                 */
303                t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
304                    DCT_EOB_TOKEN : ZERO_TOKEN;
305                t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
306                    DCT_EOB_TOKEN : ZERO_TOKEN;
307            }
308            else
309            {
310                t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
311            }
312            if (next < 16)
313            {
314                band = vp8_coef_bands[i + 1];
315                if(t0!=DCT_EOB_TOKEN)
316                {
317                    pt = vp8_prev_token_class[t0];
318                    rate0 += mb->token_costs[type][band][pt][
319                        tokens[next][0].token];
320                }
321                if(t1!=DCT_EOB_TOKEN)
322                {
323                    pt = vp8_prev_token_class[t1];
324                    rate1 += mb->token_costs[type][band][pt][
325                        tokens[next][1].token];
326                }
327            }
328
329            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
330            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
331            if (rd_cost0 == rd_cost1)
332            {
333                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
334                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
335            }
336            /* And pick the best. */
337            best = rd_cost1 < rd_cost0;
338            base_bits = *(vp8_dct_value_cost_ptr + x);
339
340            if(shortcut)
341            {
342                dx -= (dequant_ptr[rc] + sz) ^ sz;
343                d2 = dx*dx;
344            }
345            tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
346            tokens[i][1].error = d2 + (best ? error1 : error0);
347            tokens[i][1].next = next;
348            tokens[i][1].token =best?t1:t0;
349            tokens[i][1].qc = x;
350            best_mask[1] |= best << i;
351            /* Finally, make this the new head of the trellis. */
352            next = i;
353        }
354        /* There's no choice to make for a zero coefficient, so we don't
355         *  add a new trellis node, but we do need to update the costs.
356         */
357        else
358        {
359            band = vp8_coef_bands[i + 1];
360            t0 = tokens[next][0].token;
361            t1 = tokens[next][1].token;
362            /* Update the cost of each path if we're past the EOB token. */
363            if (t0 != DCT_EOB_TOKEN)
364            {
365                tokens[next][0].rate += mb->token_costs[type][band][0][t0];
366                tokens[next][0].token = ZERO_TOKEN;
367            }
368            if (t1 != DCT_EOB_TOKEN)
369            {
370                tokens[next][1].rate += mb->token_costs[type][band][0][t1];
371                tokens[next][1].token = ZERO_TOKEN;
372            }
373            /* Don't update next, because we didn't add a new node. */
374        }
375    }
376
377    /* Now pick the best path through the whole trellis. */
378    band = vp8_coef_bands[i + 1];
379    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
380    rate0 = tokens[next][0].rate;
381    rate1 = tokens[next][1].rate;
382    error0 = tokens[next][0].error;
383    error1 = tokens[next][1].error;
384    t0 = tokens[next][0].token;
385    t1 = tokens[next][1].token;
386    rate0 += mb->token_costs[type][band][pt][t0];
387    rate1 += mb->token_costs[type][band][pt][t1];
388    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
389    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
390    if (rd_cost0 == rd_cost1)
391    {
392        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
393        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
394    }
395    best = rd_cost1 < rd_cost0;
396    final_eob = i0 - 1;
397    for (i = next; i < eob; i = next)
398    {
399        x = tokens[i][best].qc;
400        if (x)
401            final_eob = i;
402        rc = vp8_default_zig_zag1d[i];
403        qcoeff_ptr[rc] = x;
404        dqcoeff_ptr[rc] = x * dequant_ptr[rc];
405        next = tokens[i][best].next;
406        best = (best_mask[best] >> i) & 1;
407    }
408    final_eob++;
409
410    *a = *l = (final_eob != !type);
411    *d->eob = (char)final_eob;
412}
413static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type,
414                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
415{
416    int sum=0;
417    int i;
418    BLOCKD *bd = &x->block[24];
419
420    if(bd->dequant[0]>=35 && bd->dequant[1]>=35)
421        return;
422
423    for(i=0;i<(*bd->eob);i++)
424    {
425        int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
426        sum+= (coef>=0)?coef:-coef;
427        if(sum>=35)
428            return;
429    }
430    /**************************************************************************
431    our inverse hadamard transform effectively is weighted sum of all 16 inputs
432    with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And
433    dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the
434    output after inverse wht and idct will be all zero. A sum of absolute value
435    smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht
436    fall between -35 and +35.
437    **************************************************************************/
438    if(sum < 35)
439    {
440        for(i=0;i<(*bd->eob);i++)
441        {
442            int rc = vp8_default_zig_zag1d[i];
443            bd->qcoeff[rc]=0;
444            bd->dqcoeff[rc]=0;
445        }
446        *bd->eob = 0;
447        *a = *l = (*bd->eob != !type);
448    }
449}
450
451static void optimize_mb(MACROBLOCK *x)
452{
453    int b;
454    int type;
455    int has_2nd_order;
456
457    ENTROPY_CONTEXT_PLANES t_above, t_left;
458    ENTROPY_CONTEXT *ta;
459    ENTROPY_CONTEXT *tl;
460
461    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
462    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
463
464    ta = (ENTROPY_CONTEXT *)&t_above;
465    tl = (ENTROPY_CONTEXT *)&t_left;
466
467    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
468        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
469    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
470
471    for (b = 0; b < 16; b++)
472    {
473        optimize_b(x, b, type,
474            ta + vp8_block2above[b], tl + vp8_block2left[b]);
475    }
476
477    for (b = 16; b < 24; b++)
478    {
479        optimize_b(x, b, PLANE_TYPE_UV,
480            ta + vp8_block2above[b], tl + vp8_block2left[b]);
481    }
482
483    if (has_2nd_order)
484    {
485        b=24;
486        optimize_b(x, b, PLANE_TYPE_Y2,
487            ta + vp8_block2above[b], tl + vp8_block2left[b]);
488        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
489            ta + vp8_block2above[b], tl + vp8_block2left[b]);
490    }
491}
492
493
494void vp8_optimize_mby(MACROBLOCK *x)
495{
496    int b;
497    int type;
498    int has_2nd_order;
499
500    ENTROPY_CONTEXT_PLANES t_above, t_left;
501    ENTROPY_CONTEXT *ta;
502    ENTROPY_CONTEXT *tl;
503
504    if (!x->e_mbd.above_context)
505        return;
506
507    if (!x->e_mbd.left_context)
508        return;
509
510    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
511    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
512
513    ta = (ENTROPY_CONTEXT *)&t_above;
514    tl = (ENTROPY_CONTEXT *)&t_left;
515
516    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
517        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
518    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
519
520    for (b = 0; b < 16; b++)
521    {
522        optimize_b(x, b, type,
523            ta + vp8_block2above[b], tl + vp8_block2left[b]);
524    }
525
526
527    if (has_2nd_order)
528    {
529        b=24;
530        optimize_b(x, b, PLANE_TYPE_Y2,
531            ta + vp8_block2above[b], tl + vp8_block2left[b]);
532        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
533            ta + vp8_block2above[b], tl + vp8_block2left[b]);
534    }
535}
536
537void vp8_optimize_mbuv(MACROBLOCK *x)
538{
539    int b;
540    ENTROPY_CONTEXT_PLANES t_above, t_left;
541    ENTROPY_CONTEXT *ta;
542    ENTROPY_CONTEXT *tl;
543
544    if (!x->e_mbd.above_context)
545        return;
546
547    if (!x->e_mbd.left_context)
548        return;
549
550    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
551    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
552
553    ta = (ENTROPY_CONTEXT *)&t_above;
554    tl = (ENTROPY_CONTEXT *)&t_left;
555
556    for (b = 16; b < 24; b++)
557    {
558        optimize_b(x, b, PLANE_TYPE_UV,
559            ta + vp8_block2above[b], tl + vp8_block2left[b]);
560    }
561}
562
563void vp8_encode_inter16x16(MACROBLOCK *x)
564{
565    vp8_build_inter_predictors_mb(&x->e_mbd);
566
567    vp8_subtract_mb(x);
568
569    transform_mb(x);
570
571    vp8_quantize_mb(x);
572
573    if (x->optimize)
574        optimize_mb(x);
575}
576
577/* this funciton is used by first pass only */
578void vp8_encode_inter16x16y(MACROBLOCK *x)
579{
580    BLOCK *b = &x->block[0];
581
582    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
583                                        x->e_mbd.dst.y_stride);
584
585    vp8_subtract_mby(x->src_diff, *(b->base_src),
586        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
587
588    transform_mby(x);
589
590    vp8_quantize_mby(x);
591
592    vp8_inverse_transform_mby(&x->e_mbd);
593}
594