/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "encodemb.h"
#include "vp8/common/reconinter.h"
#include "quantize.h"
#include "tokenize.h"
#include "vp8/common/invtrans.h"
#include "vp8/common/recon.h"
#include "vp8/common/reconintra.h"
#include "dct.h"
#include "vpx_mem/vpx_mem.h"

#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
#else
#define IF_RTCD(x) NULL
#endif
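/* Produce the 4x4 residual for one block: subtract the prediction from the
 * source pixels and write the differences into the block's src_diff buffer.
 */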
void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *src_ptr = (*(be->base_src) + be->src);
    short *diff_ptr = be->src_diff;
    unsigned char *pred_ptr = bd->predictor;
    int src_stride = be->src_stride;

    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
        }

        diff_ptr += pitch;
        pred_ptr += pitch;
        src_ptr  += src_stride;
    }
}

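/* Residual for the two 8x8 chroma planes of a macroblock. The U and V
 * differences are written at offsets 256 and 320 of the diff buffer,
 * matching the 256 + 64 + 64 layout of the predictor buffer.
 */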
void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
{
    short *udiff = diff + 256;
    short *vdiff = diff + 320;
    unsigned char *upred = pred + 256;
    unsigned char *vpred = pred + 320;

    int r, c;

    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            udiff[c] = usrc[c] - upred[c];
        }

        udiff += 8;
        upred += 8;
        usrc  += stride;
    }

    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            vdiff[c] = vsrc[c] - vpred[c];
        }

        vdiff += 8;
        vpred += 8;
        vsrc  += stride;
    }
}

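/* Residual for the 16x16 luma plane of a macroblock. */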
void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
{
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            diff[c] = src[c] - pred[c];
        }

        diff += 16;
        pred += 16;
        src  += stride;
    }
}

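/* Subtract the prediction for a whole macroblock: luma first, then both
 * chroma planes.
 */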
static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
}

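/* Gather the DC coefficient of each of the 16 luma blocks into the source
 * buffer of block 24, ready for the second order transform.
 */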
static void build_dcblock(MACROBLOCK *x)
{
    short *src_diff_ptr = &x->src_diff[384];
    int i;

    for (i = 0; i < 16; i++)
    {
        src_diff_ptr[i] = x->coeff[i * 16];
    }
}

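/* Forward DCT of the eight 4x4 chroma blocks; each 8x4 call transforms a
 * pair of horizontally adjacent blocks.
 */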
void vp8_transform_mbuv(MACROBLOCK *x)
{
    int i;

    for (i = 16; i < 24; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 16);
    }
}


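/* Forward transform of the luma plane for an intra macroblock: 4x4 DCTs on
 * the residual, then a 4x4 Walsh-Hadamard transform on the collected DC
 * terms in block 24.
 */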
void vp8_transform_intra_mby(MACROBLOCK *x)
{
    int i;

    for (i = 0; i < 16; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

    // build dc block from 16 y dc values
    build_dcblock(x);

    // do 2nd order transform on the dc block
    x->short_walsh4x4(&x->block[24].src_diff[0],
        &x->block[24].coeff[0], 8);

}


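/* Forward transform of a whole macroblock. SPLITMV macroblocks code their DC
 * terms in the first order blocks, so the second order transform is skipped
 * for them.
 */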
static void transform_mb(MACROBLOCK *x)
{
    int i;

    for (i = 0; i < 16; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

    // build dc block from 16 y dc values
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
        build_dcblock(x);

    for (i = 16; i < 24; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 16);
    }

    // do 2nd order transform on the dc block
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
        x->short_walsh4x4(&x->block[24].src_diff[0],
        &x->block[24].coeff[0], 8);

}


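/* Luma-only forward transform; chroma is handled separately. This path is
 * used by the first pass luma encode further below.
 */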
static void transform_mby(MACROBLOCK *x)
{
    int i;

    for (i = 0; i < 16; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

    // build dc block from 16 y dc values
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
    {
        build_dcblock(x);
        x->short_walsh4x4(&x->block[24].src_diff[0],
            &x->block[24].coeff[0], 8);
    }
}


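/* Build the inter prediction for the whole macroblock; the "_s" variant
 * writes straight into the destination buffers, so no residual is coded
 * here. The commented-out block below is an older version that copied the
 * predictors to the destination explicitly.
 */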
void vp8_stuff_inter16x16(MACROBLOCK *x)
{
    vp8_build_inter_predictors_mb_s(&x->e_mbd);
    /*
        // recon = copy from predictors to destination
        {
            BLOCKD *b = &x->e_mbd.block[0];
            unsigned char *pred_ptr = b->predictor;
            unsigned char *dst_ptr = *(b->base_dst) + b->dst;
            int stride = b->dst_stride;

            int i;
            for(i=0;i<16;i++)
                vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);

            b = &x->e_mbd.block[16];
            pred_ptr = b->predictor;
            dst_ptr = *(b->base_dst) + b->dst;
            stride = b->dst_stride;

            for(i=0;i<8;i++)
                vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);

            b = &x->e_mbd.block[20];
            pred_ptr = b->predictor;
            dst_ptr = *(b->base_dst) + b->dst;
            stride = b->dst_stride;

            for(i=0;i<8;i++)
                vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
        }
    */
}

#if !(CONFIG_REALTIME_ONLY)
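/* Rate-distortion cost in the trellis fixed point convention: the rate is
 * scaled by the multiplier RM (rounded, then >>8) and added to the distortion
 * weighted by DM. RDTRUNC exposes the fraction discarded by that shift so
 * exact ties can be broken consistently.
 */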
#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )

typedef struct vp8_token_state vp8_token_state;

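/* One trellis node: accumulated rate and squared error from this coefficient
 * to the end of the block, the scan index of the next coded coefficient, the
 * token emitted here, and the quantized level that produces it.
 */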
struct vp8_token_state{
  int           rate;
  int           error;
  signed char   next;
  signed char   token;
  short         qc;
};

// TODO: run experiments to find optimal multiplier values
#define Y1_RD_MULT 4
#define UV_RD_MULT 2
#define Y2_RD_MULT 16

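/* Error multiplier for each plane type, indexed by PLANE_TYPE_*:
 * Y without DC, Y2, UV, Y with DC.
 */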
static const int plane_rd_mult[4]=
{
    Y1_RD_MULT,
    Y2_RD_MULT,
    UV_RD_MULT,
    Y1_RD_MULT
};

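/* Trellis-optimize the quantized coefficients of a single block. A Viterbi
 * search walks the coefficients in reverse scan order; at each nonzero
 * position it considers keeping the quantizer's output or lowering its
 * magnitude by one, and keeps whichever path minimizes RDCOST over the token
 * rate and the reconstruction error.
 */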
static void optimize_b(MACROBLOCK *mb, int ib, int type,
                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                       const VP8_ENCODER_RTCD *rtcd)
{
    BLOCK *b;
    BLOCKD *d;
    vp8_token_state tokens[17][2];
    unsigned best_mask[2];
    const short *dequant_ptr;
    const short *coeff_ptr;
    short *qcoeff_ptr;
    short *dqcoeff_ptr;
    int eob;
    int i0;
    int rc;
    int x;
    int sz;
    int next;
    int rdmult;
    int rddiv;
    int final_eob;
    int rd_cost0;
    int rd_cost1;
    int rate0;
    int rate1;
    int error0;
    int error1;
    int t0;
    int t1;
    int best;
    int band;
    int pt;
    int i;
    int err_mult = plane_rd_mult[type];

    b = &mb->block[ib];
    d = &mb->e_mbd.block[ib];

    /* Enable this to test the effect of RDO as a replacement for the dynamic
     *  zero bin instead of an augmentation of it.
     */
#if 0
    vp8_strict_quantize_b(b, d);
#endif

    dequant_ptr = d->dequant;
    coeff_ptr = b->coeff;
    qcoeff_ptr = d->qcoeff;
    dqcoeff_ptr = d->dqcoeff;
    i0 = !type;
    eob = d->eob;

    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
    rdmult = mb->rdmult * err_mult;
    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
        rdmult = (rdmult * 9)>>4;

    rddiv = mb->rddiv;
    best_mask[0] = best_mask[1] = 0;
    /* Initialize the sentinel node of the trellis. */
    tokens[eob][0].rate = 0;
    tokens[eob][0].error = 0;
    tokens[eob][0].next = 16;
    tokens[eob][0].token = DCT_EOB_TOKEN;
    tokens[eob][0].qc = 0;
    *(tokens[eob] + 1) = *(tokens[eob] + 0);
    next = eob;
    for (i = eob; i-- > i0;)
    {
        int base_bits;
        int d2;
        int dx;

        rc = vp8_default_zig_zag1d[i];
        x = qcoeff_ptr[rc];
        /* Only add a trellis state for non-zero coefficients. */
        if (x)
        {
            int shortcut=0;
            error0 = tokens[next][0].error;
            error1 = tokens[next][1].error;
            /* Evaluate the first possibility for this state. */
            rate0 = tokens[next][0].rate;
            rate1 = tokens[next][1].rate;
            t0 = (vp8_dct_value_tokens_ptr + x)->Token;
            /* Consider both possible successor states. */
            if (next < 16)
            {
                band = vp8_coef_bands[i + 1];
                pt = vp8_prev_token_class[t0];
                rate0 +=
                    mb->token_costs[type][band][pt][tokens[next][0].token];
                rate1 +=
                    mb->token_costs[type][band][pt][tokens[next][1].token];
            }
            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
            if (rd_cost0 == rd_cost1)
            {
                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
            }
            /* And pick the best. */
            best = rd_cost1 < rd_cost0;
            base_bits = *(vp8_dct_value_cost_ptr + x);
            dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
            d2 = dx*dx;
            tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
            tokens[i][0].error = d2 + (best ? error1 : error0);
            tokens[i][0].next = next;
            tokens[i][0].token = t0;
            tokens[i][0].qc = x;
            best_mask[0] |= best << i;
            /* Evaluate the second possibility for this state. */
            rate0 = tokens[next][0].rate;
            rate1 = tokens[next][1].rate;

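            /* Lowering the magnitude by one is only useful when the
             * dequantized value overshoots the original coefficient by less
             * than one quantizer step; only in that case is the reduced
             * level actually tried.
             */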
            if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
               (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
                shortcut = 1;
            else
                shortcut = 0;

            if(shortcut)
            {
                sz = -(x < 0);
                x -= 2*sz + 1;
            }

            /* Consider both possible successor states. */
            if (!x)
            {
                /* If we reduced this coefficient to zero, check to see if
                 *  we need to move the EOB back here.
                 */
                t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
                    DCT_EOB_TOKEN : ZERO_TOKEN;
                t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
                    DCT_EOB_TOKEN : ZERO_TOKEN;
            }
            else
            {
                t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
            }
            if (next < 16)
            {
                band = vp8_coef_bands[i + 1];
                if(t0!=DCT_EOB_TOKEN)
                {
                    pt = vp8_prev_token_class[t0];
                    rate0 += mb->token_costs[type][band][pt][
                        tokens[next][0].token];
                }
                if(t1!=DCT_EOB_TOKEN)
                {
                    pt = vp8_prev_token_class[t1];
                    rate1 += mb->token_costs[type][band][pt][
                        tokens[next][1].token];
                }
            }

            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
            if (rd_cost0 == rd_cost1)
            {
                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
            }
            /* And pick the best. */
            best = rd_cost1 < rd_cost0;
            base_bits = *(vp8_dct_value_cost_ptr + x);

            if(shortcut)
            {
                dx -= (dequant_ptr[rc] + sz) ^ sz;
                d2 = dx*dx;
            }
            tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
            tokens[i][1].error = d2 + (best ? error1 : error0);
            tokens[i][1].next = next;
            tokens[i][1].token =best?t1:t0;
            tokens[i][1].qc = x;
            best_mask[1] |= best << i;
            /* Finally, make this the new head of the trellis. */
            next = i;
        }
        /* There's no choice to make for a zero coefficient, so we don't
         *  add a new trellis node, but we do need to update the costs.
         */
        else
        {
            band = vp8_coef_bands[i + 1];
            t0 = tokens[next][0].token;
            t1 = tokens[next][1].token;
            /* Update the cost of each path if we're past the EOB token. */
            if (t0 != DCT_EOB_TOKEN)
            {
                tokens[next][0].rate += mb->token_costs[type][band][0][t0];
                tokens[next][0].token = ZERO_TOKEN;
            }
            if (t1 != DCT_EOB_TOKEN)
            {
                tokens[next][1].rate += mb->token_costs[type][band][0][t1];
                tokens[next][1].token = ZERO_TOKEN;
            }
            /* Don't update next, because we didn't add a new node. */
        }
    }

    /* Now pick the best path through the whole trellis. */
    band = vp8_coef_bands[i + 1];
    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
    rate0 = tokens[next][0].rate;
    rate1 = tokens[next][1].rate;
    error0 = tokens[next][0].error;
    error1 = tokens[next][1].error;
    t0 = tokens[next][0].token;
    t1 = tokens[next][1].token;
    rate0 += mb->token_costs[type][band][pt][t0];
    rate1 += mb->token_costs[type][band][pt][t1];
    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
    if (rd_cost0 == rd_cost1)
    {
        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
    }
    best = rd_cost1 < rd_cost0;
    final_eob = i0 - 1;
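    /* Trace the winning path forward from the head of the trellis, writing
     * back the chosen levels and their dequantized values and noting the
     * last nonzero position so the EOB can be updated.
     */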
    for (i = next; i < eob; i = next)
    {
        x = tokens[i][best].qc;
        if (x)
            final_eob = i;
        rc = vp8_default_zig_zag1d[i];
        qcoeff_ptr[rc] = x;
        dqcoeff_ptr[rc] = x * dequant_ptr[rc];
        next = tokens[i][best].next;
        best = (best_mask[best] >> i) & 1;
    }
    final_eob++;

    d->eob = final_eob;
    *a = *l = (d->eob != !type);
}

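/* Run the trellis optimization over every block of a macroblock, threading
 * the above/left entropy contexts through local copies so each block's token
 * costs are conditioned on its neighbours.
 */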
static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
{
    int b;
    int type;
    int has_2nd_order;
    ENTROPY_CONTEXT_PLANES t_above, t_left;
    ENTROPY_CONTEXT *ta;
    ENTROPY_CONTEXT *tl;

    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

    ta = (ENTROPY_CONTEXT *)&t_above;
    tl = (ENTROPY_CONTEXT *)&t_left;

    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;

    for (b = 0; b < 16; b++)
    {
        optimize_b(x, b, type,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    for (b = 16; b < 24; b++)
    {
        optimize_b(x, b, PLANE_TYPE_UV,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    if (has_2nd_order)
    {
        b=24;
        optimize_b(x, b, PLANE_TYPE_Y2,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
}


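/* Trellis optimization of the luma blocks (and the second order block when
 * it is present); chroma is left untouched.
 */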
void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
{
    int b;
    int type;
    int has_2nd_order;

    ENTROPY_CONTEXT_PLANES t_above, t_left;
    ENTROPY_CONTEXT *ta;
    ENTROPY_CONTEXT *tl;

    if (!x->e_mbd.above_context)
        return;

    if (!x->e_mbd.left_context)
        return;

    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

    ta = (ENTROPY_CONTEXT *)&t_above;
    tl = (ENTROPY_CONTEXT *)&t_left;

    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;

    for (b = 0; b < 16; b++)
    {
        optimize_b(x, b, type,
        ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }


    if (has_2nd_order)
    {
        b=24;
        optimize_b(x, b, PLANE_TYPE_Y2,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
}

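/* Trellis optimization of the chroma blocks only. */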
void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
{
    int b;
    ENTROPY_CONTEXT_PLANES t_above, t_left;
    ENTROPY_CONTEXT *ta;
    ENTROPY_CONTEXT *tl;

    if (!x->e_mbd.above_context)
        return;

    if (!x->e_mbd.left_context)
        return;

    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

    ta = (ENTROPY_CONTEXT *)&t_above;
    tl = (ENTROPY_CONTEXT *)&t_left;

    for (b = 16; b < 24; b++)
    {
        optimize_b(x, b, PLANE_TYPE_UV,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
}
#endif

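/* Encode an inter macroblock: build the prediction, subtract it from the
 * source, transform and quantize, optionally trellis-optimize the
 * coefficients, then inverse transform and reconstruct so the frame buffer
 * holds the same pixels the decoder will produce.
 */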
void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_inter_predictors_mb(&x->e_mbd);

    vp8_subtract_mb(rtcd, x);

    transform_mb(x);

    vp8_quantize_mb(x);

#if !(CONFIG_REALTIME_ONLY)
    if (x->optimize)
        optimize_mb(x, rtcd);
#endif

    vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

    RECON_INVOKE(&rtcd->common->recon, recon_mb)
        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}


/* This function is used by the first pass only; it encodes and reconstructs
 * the luma plane alone.
 */
void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_inter_predictors_mby(&x->e_mbd);

    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);

    transform_mby(x);

    vp8_quantize_mby(x);

    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

    RECON_INVOKE(&rtcd->common->recon, recon_mby)
        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}


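/* Chroma-only encode, used when evaluating modes during the rate-distortion
 * search: predict, subtract, transform and quantize the U and V planes.
 * No inverse transform or reconstruction is performed here.
 */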
void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_inter_predictors_mbuv(&x->e_mbd);
    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);

    vp8_transform_mbuv(x);

    vp8_quantize_mbuv(x);

}