1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include "vpx_config.h"
13#include "vp8_rtcd.h"
14#include "encodemb.h"
15#include "vp8/common/reconinter.h"
16#include "quantize.h"
17#include "tokenize.h"
18#include "vp8/common/invtrans.h"
19#include "vpx_mem/vpx_mem.h"
20#include "rdopt.h"
21
22void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
23{
24    unsigned char *src_ptr = (*(be->base_src) + be->src);
25    short *diff_ptr = be->src_diff;
26    unsigned char *pred_ptr = bd->predictor;
27    int src_stride = be->src_stride;
28
29    int r, c;
30
31    for (r = 0; r < 4; r++)
32    {
33        for (c = 0; c < 4; c++)
34        {
35            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
36        }
37
38        diff_ptr += pitch;
39        pred_ptr += pitch;
40        src_ptr  += src_stride;
41    }
42}
43
44void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
45                         int src_stride, unsigned char *upred,
46                         unsigned char *vpred, int pred_stride)
47{
48    short *udiff = diff + 256;
49    short *vdiff = diff + 320;
50
51    int r, c;
52
53    for (r = 0; r < 8; r++)
54    {
55        for (c = 0; c < 8; c++)
56        {
57            udiff[c] = usrc[c] - upred[c];
58        }
59
60        udiff += 8;
61        upred += pred_stride;
62        usrc  += src_stride;
63    }
64
65    for (r = 0; r < 8; r++)
66    {
67        for (c = 0; c < 8; c++)
68        {
69            vdiff[c] = vsrc[c] - vpred[c];
70        }
71
72        vdiff += 8;
73        vpred += pred_stride;
74        vsrc  += src_stride;
75    }
76}
77
78void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
79                        unsigned char *pred, int pred_stride)
80{
81    int r, c;
82
83    for (r = 0; r < 16; r++)
84    {
85        for (c = 0; c < 16; c++)
86        {
87            diff[c] = src[c] - pred[c];
88        }
89
90        diff += 16;
91        pred += pred_stride;
92        src  += src_stride;
93    }
94}
95
96static void vp8_subtract_mb(MACROBLOCK *x)
97{
98    BLOCK *b = &x->block[0];
99
100    vp8_subtract_mby(x->src_diff, *(b->base_src),
101        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
102    vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
103        x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
104        x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
105}
106
107static void build_dcblock(MACROBLOCK *x)
108{
109    short *src_diff_ptr = &x->src_diff[384];
110    int i;
111
112    for (i = 0; i < 16; i++)
113    {
114        src_diff_ptr[i] = x->coeff[i * 16];
115    }
116}
117
118void vp8_transform_mbuv(MACROBLOCK *x)
119{
120    int i;
121
122    for (i = 16; i < 24; i += 2)
123    {
124        x->short_fdct8x4(&x->block[i].src_diff[0],
125            &x->block[i].coeff[0], 16);
126    }
127}
128
129
130void vp8_transform_intra_mby(MACROBLOCK *x)
131{
132    int i;
133
134    for (i = 0; i < 16; i += 2)
135    {
136        x->short_fdct8x4(&x->block[i].src_diff[0],
137            &x->block[i].coeff[0], 32);
138    }
139
140    /* build dc block from 16 y dc values */
141    build_dcblock(x);
142
143    /* do 2nd order transform on the dc block */
144    x->short_walsh4x4(&x->block[24].src_diff[0],
145        &x->block[24].coeff[0], 8);
146
147}
148
149
150static void transform_mb(MACROBLOCK *x)
151{
152    int i;
153
154    for (i = 0; i < 16; i += 2)
155    {
156        x->short_fdct8x4(&x->block[i].src_diff[0],
157            &x->block[i].coeff[0], 32);
158    }
159
160    /* build dc block from 16 y dc values */
161    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
162        build_dcblock(x);
163
164    for (i = 16; i < 24; i += 2)
165    {
166        x->short_fdct8x4(&x->block[i].src_diff[0],
167            &x->block[i].coeff[0], 16);
168    }
169
170    /* do 2nd order transform on the dc block */
171    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
172        x->short_walsh4x4(&x->block[24].src_diff[0],
173        &x->block[24].coeff[0], 8);
174
175}
176
177
178static void transform_mby(MACROBLOCK *x)
179{
180    int i;
181
182    for (i = 0; i < 16; i += 2)
183    {
184        x->short_fdct8x4(&x->block[i].src_diff[0],
185            &x->block[i].coeff[0], 32);
186    }
187
188    /* build dc block from 16 y dc values */
189    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
190    {
191        build_dcblock(x);
192        x->short_walsh4x4(&x->block[24].src_diff[0],
193            &x->block[24].coeff[0], 8);
194    }
195}
196
197
198
199#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
200
201typedef struct vp8_token_state vp8_token_state;
202
203struct vp8_token_state{
204  int           rate;
205  int           error;
206  signed char   next;
207  signed char   token;
208  short         qc;
209};
210
211/* TODO: experiments to find optimal multiple numbers */
212#define Y1_RD_MULT 4
213#define UV_RD_MULT 2
214#define Y2_RD_MULT 16
215
216static const int plane_rd_mult[4]=
217{
218    Y1_RD_MULT,
219    Y2_RD_MULT,
220    UV_RD_MULT,
221    Y1_RD_MULT
222};
223
224static void optimize_b(MACROBLOCK *mb, int ib, int type,
225                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
226{
227    BLOCK *b;
228    BLOCKD *d;
229    vp8_token_state tokens[17][2];
230    unsigned best_mask[2];
231    const short *dequant_ptr;
232    const short *coeff_ptr;
233    short *qcoeff_ptr;
234    short *dqcoeff_ptr;
235    int eob;
236    int i0;
237    int rc;
238    int x;
239    int sz = 0;
240    int next;
241    int rdmult;
242    int rddiv;
243    int final_eob;
244    int rd_cost0;
245    int rd_cost1;
246    int rate0;
247    int rate1;
248    int error0;
249    int error1;
250    int t0;
251    int t1;
252    int best;
253    int band;
254    int pt;
255    int i;
256    int err_mult = plane_rd_mult[type];
257
258    b = &mb->block[ib];
259    d = &mb->e_mbd.block[ib];
260
261    /* Enable this to test the effect of RDO as a replacement for the dynamic
262     *  zero bin instead of an augmentation of it.
263     */
264#if 0
265    vp8_strict_quantize_b(b, d);
266#endif
267
268    dequant_ptr = d->dequant;
269    coeff_ptr = b->coeff;
270    qcoeff_ptr = d->qcoeff;
271    dqcoeff_ptr = d->dqcoeff;
272    i0 = !type;
273    eob = *d->eob;
274
275    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
276    rdmult = mb->rdmult * err_mult;
277    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
278        rdmult = (rdmult * 9)>>4;
279
280    rddiv = mb->rddiv;
281    best_mask[0] = best_mask[1] = 0;
282    /* Initialize the sentinel node of the trellis. */
283    tokens[eob][0].rate = 0;
284    tokens[eob][0].error = 0;
285    tokens[eob][0].next = 16;
286    tokens[eob][0].token = DCT_EOB_TOKEN;
287    tokens[eob][0].qc = 0;
288    *(tokens[eob] + 1) = *(tokens[eob] + 0);
289    next = eob;
290    for (i = eob; i-- > i0;)
291    {
292        int base_bits;
293        int d2;
294        int dx;
295
296        rc = vp8_default_zig_zag1d[i];
297        x = qcoeff_ptr[rc];
298        /* Only add a trellis state for non-zero coefficients. */
299        if (x)
300        {
301            int shortcut=0;
302            error0 = tokens[next][0].error;
303            error1 = tokens[next][1].error;
304            /* Evaluate the first possibility for this state. */
305            rate0 = tokens[next][0].rate;
306            rate1 = tokens[next][1].rate;
307            t0 = (vp8_dct_value_tokens_ptr + x)->Token;
308            /* Consider both possible successor states. */
309            if (next < 16)
310            {
311                band = vp8_coef_bands[i + 1];
312                pt = vp8_prev_token_class[t0];
313                rate0 +=
314                    mb->token_costs[type][band][pt][tokens[next][0].token];
315                rate1 +=
316                    mb->token_costs[type][band][pt][tokens[next][1].token];
317            }
318            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
319            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
320            if (rd_cost0 == rd_cost1)
321            {
322                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
323                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
324            }
325            /* And pick the best. */
326            best = rd_cost1 < rd_cost0;
327            base_bits = *(vp8_dct_value_cost_ptr + x);
328            dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
329            d2 = dx*dx;
330            tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
331            tokens[i][0].error = d2 + (best ? error1 : error0);
332            tokens[i][0].next = next;
333            tokens[i][0].token = t0;
334            tokens[i][0].qc = x;
335            best_mask[0] |= best << i;
336            /* Evaluate the second possibility for this state. */
337            rate0 = tokens[next][0].rate;
338            rate1 = tokens[next][1].rate;
339
340            if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
341               (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
342                shortcut = 1;
343            else
344                shortcut = 0;
345
346            if(shortcut)
347            {
348                sz = -(x < 0);
349                x -= 2*sz + 1;
350            }
351
352            /* Consider both possible successor states. */
353            if (!x)
354            {
355                /* If we reduced this coefficient to zero, check to see if
356                 *  we need to move the EOB back here.
357                 */
358                t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
359                    DCT_EOB_TOKEN : ZERO_TOKEN;
360                t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
361                    DCT_EOB_TOKEN : ZERO_TOKEN;
362            }
363            else
364            {
365                t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
366            }
367            if (next < 16)
368            {
369                band = vp8_coef_bands[i + 1];
370                if(t0!=DCT_EOB_TOKEN)
371                {
372                    pt = vp8_prev_token_class[t0];
373                    rate0 += mb->token_costs[type][band][pt][
374                        tokens[next][0].token];
375                }
376                if(t1!=DCT_EOB_TOKEN)
377                {
378                    pt = vp8_prev_token_class[t1];
379                    rate1 += mb->token_costs[type][band][pt][
380                        tokens[next][1].token];
381                }
382            }
383
384            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
385            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
386            if (rd_cost0 == rd_cost1)
387            {
388                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
389                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
390            }
391            /* And pick the best. */
392            best = rd_cost1 < rd_cost0;
393            base_bits = *(vp8_dct_value_cost_ptr + x);
394
395            if(shortcut)
396            {
397                dx -= (dequant_ptr[rc] + sz) ^ sz;
398                d2 = dx*dx;
399            }
400            tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
401            tokens[i][1].error = d2 + (best ? error1 : error0);
402            tokens[i][1].next = next;
403            tokens[i][1].token =best?t1:t0;
404            tokens[i][1].qc = x;
405            best_mask[1] |= best << i;
406            /* Finally, make this the new head of the trellis. */
407            next = i;
408        }
409        /* There's no choice to make for a zero coefficient, so we don't
410         *  add a new trellis node, but we do need to update the costs.
411         */
412        else
413        {
414            band = vp8_coef_bands[i + 1];
415            t0 = tokens[next][0].token;
416            t1 = tokens[next][1].token;
417            /* Update the cost of each path if we're past the EOB token. */
418            if (t0 != DCT_EOB_TOKEN)
419            {
420                tokens[next][0].rate += mb->token_costs[type][band][0][t0];
421                tokens[next][0].token = ZERO_TOKEN;
422            }
423            if (t1 != DCT_EOB_TOKEN)
424            {
425                tokens[next][1].rate += mb->token_costs[type][band][0][t1];
426                tokens[next][1].token = ZERO_TOKEN;
427            }
428            /* Don't update next, because we didn't add a new node. */
429        }
430    }
431
432    /* Now pick the best path through the whole trellis. */
433    band = vp8_coef_bands[i + 1];
434    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
435    rate0 = tokens[next][0].rate;
436    rate1 = tokens[next][1].rate;
437    error0 = tokens[next][0].error;
438    error1 = tokens[next][1].error;
439    t0 = tokens[next][0].token;
440    t1 = tokens[next][1].token;
441    rate0 += mb->token_costs[type][band][pt][t0];
442    rate1 += mb->token_costs[type][band][pt][t1];
443    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
444    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
445    if (rd_cost0 == rd_cost1)
446    {
447        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
448        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
449    }
450    best = rd_cost1 < rd_cost0;
451    final_eob = i0 - 1;
452    for (i = next; i < eob; i = next)
453    {
454        x = tokens[i][best].qc;
455        if (x)
456            final_eob = i;
457        rc = vp8_default_zig_zag1d[i];
458        qcoeff_ptr[rc] = x;
459        dqcoeff_ptr[rc] = x * dequant_ptr[rc];
460        next = tokens[i][best].next;
461        best = (best_mask[best] >> i) & 1;
462    }
463    final_eob++;
464
465    *a = *l = (final_eob != !type);
466    *d->eob = (char)final_eob;
467}
468static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type,
469                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
470{
471    int sum=0;
472    int i;
473    BLOCKD *bd = &x->block[24];
474
475    if(bd->dequant[0]>=35 && bd->dequant[1]>=35)
476        return;
477
478    for(i=0;i<(*bd->eob);i++)
479    {
480        int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
481        sum+= (coef>=0)?coef:-coef;
482        if(sum>=35)
483            return;
484    }
485    /**************************************************************************
486    our inverse hadamard transform effectively is weighted sum of all 16 inputs
487    with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And
488    dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the
489    output after inverse wht and idct will be all zero. A sum of absolute value
490    smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht
491    fall between -35 and +35.
492    **************************************************************************/
493    if(sum < 35)
494    {
495        for(i=0;i<(*bd->eob);i++)
496        {
497            int rc = vp8_default_zig_zag1d[i];
498            bd->qcoeff[rc]=0;
499            bd->dqcoeff[rc]=0;
500        }
501        *bd->eob = 0;
502        *a = *l = (*bd->eob != !type);
503    }
504}
505
506static void optimize_mb(MACROBLOCK *x)
507{
508    int b;
509    int type;
510    int has_2nd_order;
511
512    ENTROPY_CONTEXT_PLANES t_above, t_left;
513    ENTROPY_CONTEXT *ta;
514    ENTROPY_CONTEXT *tl;
515
516    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
517    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
518
519    ta = (ENTROPY_CONTEXT *)&t_above;
520    tl = (ENTROPY_CONTEXT *)&t_left;
521
522    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
523        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
524    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
525
526    for (b = 0; b < 16; b++)
527    {
528        optimize_b(x, b, type,
529            ta + vp8_block2above[b], tl + vp8_block2left[b]);
530    }
531
532    for (b = 16; b < 24; b++)
533    {
534        optimize_b(x, b, PLANE_TYPE_UV,
535            ta + vp8_block2above[b], tl + vp8_block2left[b]);
536    }
537
538    if (has_2nd_order)
539    {
540        b=24;
541        optimize_b(x, b, PLANE_TYPE_Y2,
542            ta + vp8_block2above[b], tl + vp8_block2left[b]);
543        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
544            ta + vp8_block2above[b], tl + vp8_block2left[b]);
545    }
546}
547
548
549void vp8_optimize_mby(MACROBLOCK *x)
550{
551    int b;
552    int type;
553    int has_2nd_order;
554
555    ENTROPY_CONTEXT_PLANES t_above, t_left;
556    ENTROPY_CONTEXT *ta;
557    ENTROPY_CONTEXT *tl;
558
559    if (!x->e_mbd.above_context)
560        return;
561
562    if (!x->e_mbd.left_context)
563        return;
564
565    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
566    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
567
568    ta = (ENTROPY_CONTEXT *)&t_above;
569    tl = (ENTROPY_CONTEXT *)&t_left;
570
571    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
572        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
573    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
574
575    for (b = 0; b < 16; b++)
576    {
577        optimize_b(x, b, type,
578            ta + vp8_block2above[b], tl + vp8_block2left[b]);
579    }
580
581
582    if (has_2nd_order)
583    {
584        b=24;
585        optimize_b(x, b, PLANE_TYPE_Y2,
586            ta + vp8_block2above[b], tl + vp8_block2left[b]);
587        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
588            ta + vp8_block2above[b], tl + vp8_block2left[b]);
589    }
590}
591
592void vp8_optimize_mbuv(MACROBLOCK *x)
593{
594    int b;
595    ENTROPY_CONTEXT_PLANES t_above, t_left;
596    ENTROPY_CONTEXT *ta;
597    ENTROPY_CONTEXT *tl;
598
599    if (!x->e_mbd.above_context)
600        return;
601
602    if (!x->e_mbd.left_context)
603        return;
604
605    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
606    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
607
608    ta = (ENTROPY_CONTEXT *)&t_above;
609    tl = (ENTROPY_CONTEXT *)&t_left;
610
611    for (b = 16; b < 24; b++)
612    {
613        optimize_b(x, b, PLANE_TYPE_UV,
614            ta + vp8_block2above[b], tl + vp8_block2left[b]);
615    }
616}
617
618void vp8_encode_inter16x16(MACROBLOCK *x)
619{
620    vp8_build_inter_predictors_mb(&x->e_mbd);
621
622    vp8_subtract_mb(x);
623
624    transform_mb(x);
625
626    vp8_quantize_mb(x);
627
628    if (x->optimize)
629        optimize_mb(x);
630}
631
632/* this funciton is used by first pass only */
633void vp8_encode_inter16x16y(MACROBLOCK *x)
634{
635    BLOCK *b = &x->block[0];
636
637    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
638                                        x->e_mbd.dst.y_stride);
639
640    vp8_subtract_mby(x->src_diff, *(b->base_src),
641        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
642
643    transform_mby(x);
644
645    vp8_quantize_mby(x);
646
647    vp8_inverse_transform_mby(&x->e_mbd);
648}
649