dec/src/idct.cpp

/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
/*
------------------------------------------------------------------------------
 MODULE DESCRIPTION

 This file contains the functions that transform an 8x8 image block from
 dequantized DCT coefficients to spatial domain pixel values by calculating
 the inverse discrete cosine transform (IDCT).

------------------------------------------------------------------------------
*/
/*----------------------------------------------------------------------------
; INCLUDES
----------------------------------------------------------------------------*/
#include "mp4dec_lib.h"
#include "idct.h"
#include "motion_comp.h"
#ifndef FAST_IDCT

/*
------------------------------------------------------------------------------
 FUNCTION NAME: idct
------------------------------------------------------------------------------
 INPUT AND OUTPUT DEFINITIONS FOR idct

 Inputs:
    blk = pointer to the buffer containing the dequantized DCT
          coefficients of type int for an 8x8 image block;
          values lie in the range [-2048, 2047] as defined by the standard.

 Local Stores/Buffers/Pointers Needed:
    None

 Global Stores/Buffers/Pointers Needed:
    None

 Outputs:
    None

 Pointers and Buffers Modified:
    blk points to the computed IDCT values for an 8x8 image block.

 Local Stores Modified:
    None

 Global Stores Modified:
    None

------------------------------------------------------------------------------
 FUNCTION DESCRIPTION FOR idct

 This function transforms an 8x8 image block from dequantized DCT coefficients
 (F(u,v)) to spatial domain pixel values (f(x,y)) by performing the two
 dimensional inverse discrete cosine transform (IDCT).

              _7_ _7_         C(u) C(v)
    f(x,y) =  \   \   F(u,v) ---- ---- cos[(2x+1)*u*pi/16] cos[(2y+1)*v*pi/16]
              /__ /__          2    2
              u=0 v=0

    where   C(i) = 1/sqrt(2)    if i=0
            C(i) = 1            otherwise

 2-D IDCT can be separated as horizontal(row-wise) and vertical(column-wise)
 1-D IDCTs. Therefore, 2-D IDCT values are found by the following two steps:
 1. Find horizontal 1-D IDCT values for each row from 8x8 dequantized DCT
    coefficients by row IDCT operation.

              _7_         C(u)
    g(x,v) =  \   F(u,v) ---- cos[(2x+1)*u*pi/16]
              /__          2
              u=0

 2. Find vertical 1-D IDCT values for each column from the results of 1
    by column IDCT operation.

              _7_         C(v)
    f(x,y) =  \   g(x,v) ---- cos[(2y+1)*v*pi/16]
              /__          2
              v=0

 Note: the code in this file applies the column (vertical) pass first and the
 row (horizontal) pass second; because the transform is separable, the two
 orderings are mathematically equivalent.

------------------------------------------------------------------------------
 REQUIREMENTS FOR idct

 None

------------------------------------------------------------------------------
*/
/*  REFERENCES FOR idct */
/* idct.c, inverse fast discrete cosine transform
 inverse two dimensional DCT, Chen-Wang algorithm
 (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)
 32-bit integer arithmetic (8 bit coefficients)
 11 mults, 29 adds per DCT
 sE, 18.8.91

 coefficients extended to 12 bit for IEEE1180-1990
 compliance                           sE,  2.1.94
*/


/*----------------------------------------------------------------------------
; Function Code FOR idct
----------------------------------------------------------------------------*/
void idct_intra(
    int *blk, uint8 *comp, int width
)
{
    /*----------------------------------------------------------------------------
    ; Define all local variables
    ----------------------------------------------------------------------------*/
    int i;
    int32   tmpBLK[64];
    int32   *tmpBLK32 = &tmpBLK[0];
    int32   r0, r1, r2, r3, r4, r5, r6, r7, r8; /* butterfly nodes */
    int32   a;
    int offset = width - 8;
    /*----------------------------------------------------------------------------
    ; Function body here
    ----------------------------------------------------------------------------*/
    /* two dimensional inverse discrete cosine transform */


    /* column (vertical) IDCT */
    for (i = B_SIZE - 1; i >= 0; i--)
    {
        /* initialize butterfly nodes at first stage */

        r1 = blk[B_SIZE * 4 + i] << 11;
        /* since row IDCT results have net left shift by 3 */
        /* this left shift by 8 gives net left shift by 11 */
        /* in order to maintain the same scale as that of  */
        /* coefficients Wi */

        r2 = blk[B_SIZE * 6 + i];
        r3 = blk[B_SIZE * 2 + i];
        r4 = blk[B_SIZE * 1 + i];
        r5 = blk[B_SIZE * 7 + i];
        r6 = blk[B_SIZE * 5 + i];
        r7 = blk[B_SIZE * 3 + i];

        if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
        {
            /* shortcut */
            /* execute if values of g(r,1) to g(r,7) in a column*/
            /* are all zeros */

            /* make output of IDCT >>3 or scaled by 1/8 and */
            /* with the proper rounding */
            a = (blk[B_SIZE * 0 + i]) << 3;
            tmpBLK32[B_SIZE * 0 + i] = a;
            tmpBLK32[B_SIZE * 1 + i] = a;
            tmpBLK32[B_SIZE * 2 + i] = a;
            tmpBLK32[B_SIZE * 3 + i] = a;
            tmpBLK32[B_SIZE * 4 + i] = a;
            tmpBLK32[B_SIZE * 5 + i] = a;
            tmpBLK32[B_SIZE * 6 + i] = a;
            tmpBLK32[B_SIZE * 7 + i] = a;
        }
        else
        {
            r0 = (blk[8 * 0 + i] << 11) + 128;

            /* first stage */

            r8 = W7 * (r4 + r5);
            r4 = (r8 + (W1 - W7) * r4);
            /* Multiplication with Wi increases the net left */
            /* shift from 11 to 14,we have to shift back by 3*/
            r5 = (r8 - (W1 + W7) * r5);
            r8 = W3 * (r6 + r7);
            r6 = (r8 - (W3 - W5) * r6);
            r7 = (r8 - (W3 + W5) * r7);

            /* second stage */
            r8 = r0 + r1;
            r0 -= r1;

            r1 = W6 * (r3 + r2);
            r2 = (r1 - (W2 + W6) * r2);
            r3 = (r1 + (W2 - W6) * r3);

            r1 = r4 + r6;
            r4 -= r6;
            r6 = r5 + r7;
            r5 -= r7;

            /* third stage */
            r7 = r8 + r3;
            r8 -= r3;
            r3 = r0 + r2;
            r0 -= r2;
            r2 = (181 * (r4 + r5) + 128) >> 8;  /* rounding */
            r4 = (181 * (r4 - r5) + 128) >> 8;

            /* fourth stage */
            /* net shift of IDCT is >>3 after the following */
            /* shift operation, it makes output of 2-D IDCT */
            /* scaled by 1/8, that is scaled twice by       */
            /* 1/(2*sqrt(2)) for row IDCT and column IDCT.  */
            /* see detail analysis in design doc.           */
            tmpBLK32[0 + i] = (r7 + r1) >> 8;
            tmpBLK32[(1<<3) + i] = (r3 + r2) >> 8;
            tmpBLK32[(2<<3) + i] = (r0 + r4) >> 8;
            tmpBLK32[(3<<3) + i] = (r8 + r6) >> 8;
            tmpBLK32[(4<<3) + i] = (r8 - r6) >> 8;
            tmpBLK32[(5<<3) + i] = (r0 - r4) >> 8;
            tmpBLK32[(6<<3) + i] = (r3 - r2) >> 8;
            tmpBLK32[(7<<3) + i] = (r7 - r1) >> 8;
        }
    }
    /* row (horizontal) IDCT */
    for (i = 0 ; i < B_SIZE; i++)
    {
        /* initialize butterfly nodes at the first stage */

        r1 = ((int32)tmpBLK32[4+(i<<3)]) << 8;
        /* r1 left shift by 11 is to maintain the same  */
        /* scale as that of coefficients (W1,...W7) */
        /* since blk[4] won't multiply with Wi.     */
        /* see detail diagram in design document.   */

        r2 = tmpBLK32[6+(i<<3)];
        r3 = tmpBLK32[2+(i<<3)];
        r4 = tmpBLK32[1+(i<<3)];
        r5 = tmpBLK32[7+(i<<3)];
        r6 = tmpBLK32[5+(i<<3)];
        r7 = tmpBLK32[3+(i<<3)];

        if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
        {
            /* shortcut */
            /* execute if values of F(1,v) to F(7,v) in a row*/
            /* are all zeros */

            /* output of row IDCT scaled by 8 */
            a = (((int32)tmpBLK32[0+(i<<3)] + 32) >> 6);
            CLIP_RESULT(a)
            *comp++ = a;
            *comp++ = a;
            *comp++ = a;
            *comp++ = a;
            *comp++ = a;
            *comp++ = a;
            *comp++ = a;
            *comp++ = a;

            comp += offset;
        }

        else
        {
            /* for proper rounding in the fourth stage */
            r0 = (((int32)tmpBLK32[0+(i<<3)]) << 8) + 8192;

            /* first stage */

            r8 = W7 * (r4 + r5) + 4;
            r4 = (r8 + (W1 - W7) * r4) >> 3;
            r5 = (r8 - (W1 + W7) * r5) >> 3;

            r8 = W3 * (r6 + r7) + 4;
            r6 = (r8 - (W3 - W5) * r6) >> 3;
            r7 = (r8 - (W3 + W5) * r7) >> 3;

            /* second stage */
            r8 = r0 + r1;
            r0 -= r1;

            r1 = W6 * (r3 + r2) + 4;
            r2 = (r1 - (W2 + W6) * r2) >> 3;
            r3 = (r1 + (W2 - W6) * r3) >> 3;

            r1 = r4 + r6;
            r4 -= r6;
            r6 = r5 + r7;
            r5 -= r7;

            /* third stage */
            r7 = r8 + r3;
            r8 -= r3;
            r3 = r0 + r2;
            r0 -= r2;
            r2 = (181 * (r4 + r5) + 128) >> 8;    /* rounding */
            r4 = (181 * (r4 - r5) + 128) >> 8;

            /* fourth stage */
            /* net shift of this function is <<3 after the    */
            /* following shift operation, it makes output of  */
            /* row IDCT scaled by 8 to retain 3 bits precision*/
            a = ((r7 + r1) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;
            a = ((r3 + r2) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;
            a = ((r0 + r4) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;
            a = ((r8 + r6) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;
            a = ((r8 - r6) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;
            a = ((r0 - r4) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;
            a = ((r3 - r2) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;
            a = ((r7 - r1) >> 14);
            CLIP_RESULT(a)
            *comp++ = a;

            comp += offset;
        }
    }


    /*----------------------------------------------------------------------------
    ; Return nothing or data or data pointer
    ----------------------------------------------------------------------------*/
    return;
}

void idct(
    int *blk, uint8 *pred, uint8 *dst, int width)
{
    /*----------------------------------------------------------------------------
    ; Define all local variables
    ----------------------------------------------------------------------------*/
    int i;
    int32   tmpBLK[64];
    int32   *tmpBLK32 = &tmpBLK[0];
    int32   r0, r1, r2, r3, r4, r5, r6, r7, r8; /* butterfly nodes */
    int32   a;
    int res;

    /*----------------------------------------------------------------------------
    ; Function body here
    ----------------------------------------------------------------------------*/
    /* two dimensional inverse discrete cosine transform */


    /* column (vertical) IDCT */
    for (i = B_SIZE - 1; i >= 0; i--)
    {
        /* initialize butterfly nodes at first stage */

        r1 = blk[B_SIZE * 4 + i] << 11;
        /* since row IDCT results have net left shift by 3 */
        /* this left shift by 8 gives net left shift by 11 */
        /* in order to maintain the same scale as that of  */
        /* coefficients Wi */

        r2 = blk[B_SIZE * 6 + i];
        r3 = blk[B_SIZE * 2 + i];
        r4 = blk[B_SIZE * 1 + i];
        r5 = blk[B_SIZE * 7 + i];
        r6 = blk[B_SIZE * 5 + i];
        r7 = blk[B_SIZE * 3 + i];

        if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
        {
            /* shortcut */
            /* execute if values of g(r,1) to g(r,7) in a column*/
            /* are all zeros */

            /* make output of IDCT >>3 or scaled by 1/8 and */
            /* with the proper rounding */
            a = (blk[B_SIZE * 0 + i]) << 3;
            tmpBLK32[B_SIZE * 0 + i] = a;
            tmpBLK32[B_SIZE * 1 + i] = a;
            tmpBLK32[B_SIZE * 2 + i] = a;
            tmpBLK32[B_SIZE * 3 + i] = a;
            tmpBLK32[B_SIZE * 4 + i] = a;
            tmpBLK32[B_SIZE * 5 + i] = a;
            tmpBLK32[B_SIZE * 6 + i] = a;
            tmpBLK32[B_SIZE * 7 + i] = a;
        }
        else
        {
            r0 = (blk[8 * 0 + i] << 11) + 128;

            /* first stage */

            r8 = W7 * (r4 + r5);
            r4 = (r8 + (W1 - W7) * r4);
            /* Multiplication with Wi increases the net left */
            /* shift from 11 to 14,we have to shift back by 3*/
            r5 = (r8 - (W1 + W7) * r5);
            r8 = W3 * (r6 + r7);
            r6 = (r8 - (W3 - W5) * r6);
            r7 = (r8 - (W3 + W5) * r7);

            /* second stage */
            r8 = r0 + r1;
            r0 -= r1;

            r1 = W6 * (r3 + r2);
            r2 = (r1 - (W2 + W6) * r2);
            r3 = (r1 + (W2 - W6) * r3);

            r1 = r4 + r6;
            r4 -= r6;
            r6 = r5 + r7;
            r5 -= r7;

            /* third stage */
            r7 = r8 + r3;
            r8 -= r3;
            r3 = r0 + r2;
            r0 -= r2;
            r2 = (181 * (r4 + r5) + 128) >> 8;  /* rounding */
            r4 = (181 * (r4 - r5) + 128) >> 8;

            /* fourth stage */
            /* net shift of IDCT is >>3 after the following */
            /* shift operation, it makes output of 2-D IDCT */
            /* scaled by 1/8, that is scaled twice by       */
            /* 1/(2*sqrt(2)) for row IDCT and column IDCT.  */
            /* see detail analysis in design doc.           */
            tmpBLK32[0 + i] = (r7 + r1) >> 8;
            tmpBLK32[(1<<3) + i] = (r3 + r2) >> 8;
            tmpBLK32[(2<<3) + i] = (r0 + r4) >> 8;
            tmpBLK32[(3<<3) + i] = (r8 + r6) >> 8;
            tmpBLK32[(4<<3) + i] = (r8 - r6) >> 8;
            tmpBLK32[(5<<3) + i] = (r0 - r4) >> 8;
            tmpBLK32[(6<<3) + i] = (r3 - r2) >> 8;
            tmpBLK32[(7<<3) + i] = (r7 - r1) >> 8;
        }
    }
    /* row (horizontal) IDCT */
    for (i = B_SIZE - 1; i >= 0; i--)
    {
        /* initialize butterfly nodes at the first stage */

        r1 = ((int32)tmpBLK32[4+(i<<3)]) << 8;
        /* r1 left shift by 11 is to maintain the same  */
        /* scale as that of coefficients (W1,...W7) */
        /* since blk[4] won't multiply with Wi.     */
        /* see detail diagram in design document.   */

        r2 = tmpBLK32[6+(i<<3)];
        r3 = tmpBLK32[2+(i<<3)];
        r4 = tmpBLK32[1+(i<<3)];
        r5 = tmpBLK32[7+(i<<3)];
        r6 = tmpBLK32[5+(i<<3)];
        r7 = tmpBLK32[3+(i<<3)];

        if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
        {
            /* shortcut */
            /* execute if values of F(1,v) to F(7,v) in a row*/
            /* are all zeros */

            /* output of row IDCT scaled by 8 */
            a = (tmpBLK32[0+(i<<3)] + 32) >> 6;
            blk[0+(i<<3)] = a;
            blk[1+(i<<3)] = a;
            blk[2+(i<<3)] = a;
            blk[3+(i<<3)] = a;
            blk[4+(i<<3)] = a;
            blk[5+(i<<3)] = a;
            blk[6+(i<<3)] = a;
            blk[7+(i<<3)] = a;

        }

        else
        {
            /* for proper rounding in the fourth stage */
            r0 = (((int32)tmpBLK32[0+(i<<3)]) << 8) + 8192;

            /* first stage */

            r8 = W7 * (r4 + r5) + 4;
            r4 = (r8 + (W1 - W7) * r4) >> 3;
            r5 = (r8 - (W1 + W7) * r5) >> 3;

            r8 = W3 * (r6 + r7) + 4;
            r6 = (r8 - (W3 - W5) * r6) >> 3;
            r7 = (r8 - (W3 + W5) * r7) >> 3;

            /* second stage */
            r8 = r0 + r1;
            r0 -= r1;

            r1 = W6 * (r3 + r2) + 4;
            r2 = (r1 - (W2 + W6) * r2) >> 3;
            r3 = (r1 + (W2 - W6) * r3) >> 3;

            r1 = r4 + r6;
            r4 -= r6;
            r6 = r5 + r7;
            r5 -= r7;

            /* third stage */
            r7 = r8 + r3;
            r8 -= r3;
            r3 = r0 + r2;
            r0 -= r2;
            r2 = (181 * (r4 + r5) + 128) >> 8;    /* rounding */
            r4 = (181 * (r4 - r5) + 128) >> 8;

            /* fourth stage */
            /* net shift of this function is <<3 after the    */
            /* following shift operation, it makes output of  */
            /* row IDCT scaled by 8 to retain 3 bits precision*/
            blk[0+(i<<3)] = (r7 + r1) >> 14;
            blk[1+(i<<3)] = (r3 + r2) >> 14;
            blk[2+(i<<3)] = (r0 + r4) >> 14;
            blk[3+(i<<3)] = (r8 + r6) >> 14;
            blk[4+(i<<3)] = (r8 - r6) >> 14;
            blk[5+(i<<3)] = (r0 - r4) >> 14;
            blk[6+(i<<3)] = (r3 - r2) >> 14;
            blk[7+(i<<3)] = (r7 - r1) >> 14;
        }
        /*  add with prediction ,  08/03/05 */
        res = (*pred++ + block[0+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;
        res = (*pred++ + block[1+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;
        res = (*pred++ + block[2+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;
        res = (*pred++ + block[3+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;
        res = (*pred++ + block[4+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;
        res = (*pred++ + block[5+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;
        res = (*pred++ + block[6+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;
        res = (*pred++ + block[7+(i<<3)]);
        CLIP_RESULT(res);
        *dst++ = res;

        pred += 8;
        dst += (width - 8);
    }


    /*----------------------------------------------------------------------------
    ; Return nothing or data or data pointer
    ----------------------------------------------------------------------------*/
    return;
}

#endif
/*----------------------------------------------------------------------------
; End Function: idct
----------------------------------------------------------------------------*/