1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include "vpx_config.h"
13#include "vp8_rtcd.h"
14#include "encodemb.h"
15#include "vp8/common/reconinter.h"
16#include "quantize.h"
17#include "tokenize.h"
18#include "vp8/common/invtrans.h"
19#include "vpx_mem/vpx_mem.h"
20#include "rdopt.h"
21
22void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
23{
24    unsigned char *src_ptr = (*(be->base_src) + be->src);
25    short *diff_ptr = be->src_diff;
26    unsigned char *pred_ptr = bd->predictor;
27    int src_stride = be->src_stride;
28
29    int r, c;
30
31    for (r = 0; r < 4; r++)
32    {
33        for (c = 0; c < 4; c++)
34        {
35            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
36        }
37
38        diff_ptr += pitch;
39        pred_ptr += pitch;
40        src_ptr  += src_stride;
41    }
42}
43
44void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
45                         int src_stride, unsigned char *upred,
46                         unsigned char *vpred, int pred_stride)
47{
48    short *udiff = diff + 256;
49    short *vdiff = diff + 320;
50
51    int r, c;
52
53    for (r = 0; r < 8; r++)
54    {
55        for (c = 0; c < 8; c++)
56        {
57            udiff[c] = usrc[c] - upred[c];
58        }
59
60        udiff += 8;
61        upred += pred_stride;
62        usrc  += src_stride;
63    }
64
65    for (r = 0; r < 8; r++)
66    {
67        for (c = 0; c < 8; c++)
68        {
69            vdiff[c] = vsrc[c] - vpred[c];
70        }
71
72        vdiff += 8;
73        vpred += pred_stride;
74        vsrc  += src_stride;
75    }
76}
77
78void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
79                        unsigned char *pred, int pred_stride)
80{
81    int r, c;
82
83    for (r = 0; r < 16; r++)
84    {
85        for (c = 0; c < 16; c++)
86        {
87            diff[c] = src[c] - pred[c];
88        }
89
90        diff += 16;
91        pred += pred_stride;
92        src  += src_stride;
93    }
94}
95
96static void vp8_subtract_mb(MACROBLOCK *x)
97{
98    BLOCK *b = &x->block[0];
99
100    vp8_subtract_mby(x->src_diff, *(b->base_src),
101        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
102    vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
103        x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
104        x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
105}
106
107static void build_dcblock(MACROBLOCK *x)
108{
109    short *src_diff_ptr = &x->src_diff[384];
110    int i;
111
112    for (i = 0; i < 16; i++)
113    {
114        src_diff_ptr[i] = x->coeff[i * 16];
115    }
116}
117
118void vp8_transform_mbuv(MACROBLOCK *x)
119{
120    int i;
121
122    for (i = 16; i < 24; i += 2)
123    {
124        x->short_fdct8x4(&x->block[i].src_diff[0],
125            &x->block[i].coeff[0], 16);
126    }
127}
128
129
130void vp8_transform_intra_mby(MACROBLOCK *x)
131{
132    int i;
133
134    for (i = 0; i < 16; i += 2)
135    {
136        x->short_fdct8x4(&x->block[i].src_diff[0],
137            &x->block[i].coeff[0], 32);
138    }
139
140    /* build dc block from 16 y dc values */
141    build_dcblock(x);
142
143    /* do 2nd order transform on the dc block */
144    x->short_walsh4x4(&x->block[24].src_diff[0],
145        &x->block[24].coeff[0], 8);
146
147}
148
149
150static void transform_mb(MACROBLOCK *x)
151{
152    int i;
153
154    for (i = 0; i < 16; i += 2)
155    {
156        x->short_fdct8x4(&x->block[i].src_diff[0],
157            &x->block[i].coeff[0], 32);
158    }
159
160    /* build dc block from 16 y dc values */
161    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
162        build_dcblock(x);
163
164    for (i = 16; i < 24; i += 2)
165    {
166        x->short_fdct8x4(&x->block[i].src_diff[0],
167            &x->block[i].coeff[0], 16);
168    }
169
170    /* do 2nd order transform on the dc block */
171    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
172        x->short_walsh4x4(&x->block[24].src_diff[0],
173        &x->block[24].coeff[0], 8);
174
175}
176
177
178static void transform_mby(MACROBLOCK *x)
179{
180    int i;
181
182    for (i = 0; i < 16; i += 2)
183    {
184        x->short_fdct8x4(&x->block[i].src_diff[0],
185            &x->block[i].coeff[0], 32);
186    }
187
188    /* build dc block from 16 y dc values */
189    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
190    {
191        build_dcblock(x);
192        x->short_walsh4x4(&x->block[24].src_diff[0],
193            &x->block[24].coeff[0], 8);
194    }
195}
196
197
198
199#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
200
201typedef struct vp8_token_state vp8_token_state;
202
203struct vp8_token_state{
204  int           rate;
205  int           error;
206  signed char   next;
207  signed char   token;
208  short         qc;
209};
210
211/* TODO: experiments to find optimal multiple numbers */
212#define Y1_RD_MULT 4
213#define UV_RD_MULT 2
214#define Y2_RD_MULT 16
215
216static const int plane_rd_mult[4]=
217{
218    Y1_RD_MULT,
219    Y2_RD_MULT,
220    UV_RD_MULT,
221    Y1_RD_MULT
222};
223
224static void optimize_b(MACROBLOCK *mb, int ib, int type,
225                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
226{
227    BLOCK *b;
228    BLOCKD *d;
229    vp8_token_state tokens[17][2];
230    unsigned best_mask[2];
231    const short *dequant_ptr;
232    const short *coeff_ptr;
233    short *qcoeff_ptr;
234    short *dqcoeff_ptr;
235    int eob;
236    int i0;
237    int rc;
238    int x;
239    int sz = 0;
240    int next;
241    int rdmult;
242    int rddiv;
243    int final_eob;
244    int rd_cost0;
245    int rd_cost1;
246    int rate0;
247    int rate1;
248    int error0;
249    int error1;
250    int t0;
251    int t1;
252    int best;
253    int band;
254    int pt;
255    int i;
256    int err_mult = plane_rd_mult[type];
257
258    b = &mb->block[ib];
259    d = &mb->e_mbd.block[ib];
260
261    dequant_ptr = d->dequant;
262    coeff_ptr = b->coeff;
263    qcoeff_ptr = d->qcoeff;
264    dqcoeff_ptr = d->dqcoeff;
265    i0 = !type;
266    eob = *d->eob;
267
268    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
269    rdmult = mb->rdmult * err_mult;
270    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
271        rdmult = (rdmult * 9)>>4;
272
273    rddiv = mb->rddiv;
274    best_mask[0] = best_mask[1] = 0;
275    /* Initialize the sentinel node of the trellis. */
276    tokens[eob][0].rate = 0;
277    tokens[eob][0].error = 0;
278    tokens[eob][0].next = 16;
279    tokens[eob][0].token = DCT_EOB_TOKEN;
280    tokens[eob][0].qc = 0;
281    *(tokens[eob] + 1) = *(tokens[eob] + 0);
282    next = eob;
283    for (i = eob; i-- > i0;)
284    {
285        int base_bits;
286        int d2;
287        int dx;
288
289        rc = vp8_default_zig_zag1d[i];
290        x = qcoeff_ptr[rc];
291        /* Only add a trellis state for non-zero coefficients. */
292        if (x)
293        {
294            int shortcut=0;
295            error0 = tokens[next][0].error;
296            error1 = tokens[next][1].error;
297            /* Evaluate the first possibility for this state. */
298            rate0 = tokens[next][0].rate;
299            rate1 = tokens[next][1].rate;
300            t0 = (vp8_dct_value_tokens_ptr + x)->Token;
301            /* Consider both possible successor states. */
302            if (next < 16)
303            {
304                band = vp8_coef_bands[i + 1];
305                pt = vp8_prev_token_class[t0];
306                rate0 +=
307                    mb->token_costs[type][band][pt][tokens[next][0].token];
308                rate1 +=
309                    mb->token_costs[type][band][pt][tokens[next][1].token];
310            }
311            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
312            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
313            if (rd_cost0 == rd_cost1)
314            {
315                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
316                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
317            }
318            /* And pick the best. */
319            best = rd_cost1 < rd_cost0;
320            base_bits = *(vp8_dct_value_cost_ptr + x);
321            dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
322            d2 = dx*dx;
323            tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
324            tokens[i][0].error = d2 + (best ? error1 : error0);
325            tokens[i][0].next = next;
326            tokens[i][0].token = t0;
327            tokens[i][0].qc = x;
328            best_mask[0] |= best << i;
329            /* Evaluate the second possibility for this state. */
330            rate0 = tokens[next][0].rate;
331            rate1 = tokens[next][1].rate;
332
333            if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
334               (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
335                shortcut = 1;
336            else
337                shortcut = 0;
338
339            if(shortcut)
340            {
341                sz = -(x < 0);
342                x -= 2*sz + 1;
343            }
344
345            /* Consider both possible successor states. */
346            if (!x)
347            {
348                /* If we reduced this coefficient to zero, check to see if
349                 *  we need to move the EOB back here.
350                 */
351                t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
352                    DCT_EOB_TOKEN : ZERO_TOKEN;
353                t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
354                    DCT_EOB_TOKEN : ZERO_TOKEN;
355            }
356            else
357            {
358                t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
359            }
360            if (next < 16)
361            {
362                band = vp8_coef_bands[i + 1];
363                if(t0!=DCT_EOB_TOKEN)
364                {
365                    pt = vp8_prev_token_class[t0];
366                    rate0 += mb->token_costs[type][band][pt][
367                        tokens[next][0].token];
368                }
369                if(t1!=DCT_EOB_TOKEN)
370                {
371                    pt = vp8_prev_token_class[t1];
372                    rate1 += mb->token_costs[type][band][pt][
373                        tokens[next][1].token];
374                }
375            }
376
377            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
378            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
379            if (rd_cost0 == rd_cost1)
380            {
381                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
382                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
383            }
384            /* And pick the best. */
385            best = rd_cost1 < rd_cost0;
386            base_bits = *(vp8_dct_value_cost_ptr + x);
387
388            if(shortcut)
389            {
390                dx -= (dequant_ptr[rc] + sz) ^ sz;
391                d2 = dx*dx;
392            }
393            tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
394            tokens[i][1].error = d2 + (best ? error1 : error0);
395            tokens[i][1].next = next;
396            tokens[i][1].token =best?t1:t0;
397            tokens[i][1].qc = x;
398            best_mask[1] |= best << i;
399            /* Finally, make this the new head of the trellis. */
400            next = i;
401        }
402        /* There's no choice to make for a zero coefficient, so we don't
403         *  add a new trellis node, but we do need to update the costs.
404         */
405        else
406        {
407            band = vp8_coef_bands[i + 1];
408            t0 = tokens[next][0].token;
409            t1 = tokens[next][1].token;
410            /* Update the cost of each path if we're past the EOB token. */
411            if (t0 != DCT_EOB_TOKEN)
412            {
413                tokens[next][0].rate += mb->token_costs[type][band][0][t0];
414                tokens[next][0].token = ZERO_TOKEN;
415            }
416            if (t1 != DCT_EOB_TOKEN)
417            {
418                tokens[next][1].rate += mb->token_costs[type][band][0][t1];
419                tokens[next][1].token = ZERO_TOKEN;
420            }
421            /* Don't update next, because we didn't add a new node. */
422        }
423    }
424
425    /* Now pick the best path through the whole trellis. */
426    band = vp8_coef_bands[i + 1];
427    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
428    rate0 = tokens[next][0].rate;
429    rate1 = tokens[next][1].rate;
430    error0 = tokens[next][0].error;
431    error1 = tokens[next][1].error;
432    t0 = tokens[next][0].token;
433    t1 = tokens[next][1].token;
434    rate0 += mb->token_costs[type][band][pt][t0];
435    rate1 += mb->token_costs[type][band][pt][t1];
436    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
437    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
438    if (rd_cost0 == rd_cost1)
439    {
440        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
441        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
442    }
443    best = rd_cost1 < rd_cost0;
444    final_eob = i0 - 1;
445    for (i = next; i < eob; i = next)
446    {
447        x = tokens[i][best].qc;
448        if (x)
449            final_eob = i;
450        rc = vp8_default_zig_zag1d[i];
451        qcoeff_ptr[rc] = x;
452        dqcoeff_ptr[rc] = x * dequant_ptr[rc];
453        next = tokens[i][best].next;
454        best = (best_mask[best] >> i) & 1;
455    }
456    final_eob++;
457
458    *a = *l = (final_eob != !type);
459    *d->eob = (char)final_eob;
460}
461static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type,
462                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
463{
464    int sum=0;
465    int i;
466    BLOCKD *bd = &x->block[24];
467
468    if(bd->dequant[0]>=35 && bd->dequant[1]>=35)
469        return;
470
471    for(i=0;i<(*bd->eob);i++)
472    {
473        int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
474        sum+= (coef>=0)?coef:-coef;
475        if(sum>=35)
476            return;
477    }
478    /**************************************************************************
479    our inverse hadamard transform effectively is weighted sum of all 16 inputs
480    with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And
481    dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the
482    output after inverse wht and idct will be all zero. A sum of absolute value
483    smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht
484    fall between -35 and +35.
485    **************************************************************************/
486    if(sum < 35)
487    {
488        for(i=0;i<(*bd->eob);i++)
489        {
490            int rc = vp8_default_zig_zag1d[i];
491            bd->qcoeff[rc]=0;
492            bd->dqcoeff[rc]=0;
493        }
494        *bd->eob = 0;
495        *a = *l = (*bd->eob != !type);
496    }
497}
498
499static void optimize_mb(MACROBLOCK *x)
500{
501    int b;
502    int type;
503    int has_2nd_order;
504
505    ENTROPY_CONTEXT_PLANES t_above, t_left;
506    ENTROPY_CONTEXT *ta;
507    ENTROPY_CONTEXT *tl;
508
509    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
510    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
511
512    ta = (ENTROPY_CONTEXT *)&t_above;
513    tl = (ENTROPY_CONTEXT *)&t_left;
514
515    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
516        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
517    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
518
519    for (b = 0; b < 16; b++)
520    {
521        optimize_b(x, b, type,
522            ta + vp8_block2above[b], tl + vp8_block2left[b]);
523    }
524
525    for (b = 16; b < 24; b++)
526    {
527        optimize_b(x, b, PLANE_TYPE_UV,
528            ta + vp8_block2above[b], tl + vp8_block2left[b]);
529    }
530
531    if (has_2nd_order)
532    {
533        b=24;
534        optimize_b(x, b, PLANE_TYPE_Y2,
535            ta + vp8_block2above[b], tl + vp8_block2left[b]);
536        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
537            ta + vp8_block2above[b], tl + vp8_block2left[b]);
538    }
539}
540
541
542void vp8_optimize_mby(MACROBLOCK *x)
543{
544    int b;
545    int type;
546    int has_2nd_order;
547
548    ENTROPY_CONTEXT_PLANES t_above, t_left;
549    ENTROPY_CONTEXT *ta;
550    ENTROPY_CONTEXT *tl;
551
552    if (!x->e_mbd.above_context)
553        return;
554
555    if (!x->e_mbd.left_context)
556        return;
557
558    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
559    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
560
561    ta = (ENTROPY_CONTEXT *)&t_above;
562    tl = (ENTROPY_CONTEXT *)&t_left;
563
564    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
565        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
566    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
567
568    for (b = 0; b < 16; b++)
569    {
570        optimize_b(x, b, type,
571            ta + vp8_block2above[b], tl + vp8_block2left[b]);
572    }
573
574
575    if (has_2nd_order)
576    {
577        b=24;
578        optimize_b(x, b, PLANE_TYPE_Y2,
579            ta + vp8_block2above[b], tl + vp8_block2left[b]);
580        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
581            ta + vp8_block2above[b], tl + vp8_block2left[b]);
582    }
583}
584
585void vp8_optimize_mbuv(MACROBLOCK *x)
586{
587    int b;
588    ENTROPY_CONTEXT_PLANES t_above, t_left;
589    ENTROPY_CONTEXT *ta;
590    ENTROPY_CONTEXT *tl;
591
592    if (!x->e_mbd.above_context)
593        return;
594
595    if (!x->e_mbd.left_context)
596        return;
597
598    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
599    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
600
601    ta = (ENTROPY_CONTEXT *)&t_above;
602    tl = (ENTROPY_CONTEXT *)&t_left;
603
604    for (b = 16; b < 24; b++)
605    {
606        optimize_b(x, b, PLANE_TYPE_UV,
607            ta + vp8_block2above[b], tl + vp8_block2left[b]);
608    }
609}
610
611void vp8_encode_inter16x16(MACROBLOCK *x)
612{
613    vp8_build_inter_predictors_mb(&x->e_mbd);
614
615    vp8_subtract_mb(x);
616
617    transform_mb(x);
618
619    vp8_quantize_mb(x);
620
621    if (x->optimize)
622        optimize_mb(x);
623}
624
625/* this funciton is used by first pass only */
626void vp8_encode_inter16x16y(MACROBLOCK *x)
627{
628    BLOCK *b = &x->block[0];
629
630    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
631                                        x->e_mbd.dst.y_stride);
632
633    vp8_subtract_mby(x->src_diff, *(b->base_src),
634        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
635
636    transform_mby(x);
637
638    vp8_quantize_mby(x);
639
640    vp8_inverse_transform_mby(&x->e_mbd);
641}
642