15ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/*
25ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
35ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *
45ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Use of this source code is governed by a BSD-style license
55ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  that can be found in the LICENSE file in the root of the source
65ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  tree. An additional intellectual property rights grant can be found
75ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  in the file PATENTS.  All contributing project authors may
85ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  be found in the AUTHORS file in the root of the source tree.
95ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <assert.h>
125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <stdio.h>
135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vpx_config.h"
155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vp9_rtcd.h"
165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_common.h"
175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_blockd.h"
185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_idct.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
22b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int16_t   step_0, step_1, step_2, step_3;
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int       Temp0, Temp1, Temp2, Temp3;
255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int const_2_power_13 = 8192;
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int       i;
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 4; i--; ) {
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __asm__ __volatile__ (
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp_1 = (input[0] + input[2]) * cospi_16_64;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_0 = dct_const_round_shift(temp_1);
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp_2 = (input[0] - input[2]) * cospi_16_64;
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_1 = dct_const_round_shift(temp_2);
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp0],             0(%[input])                     \n\t"
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp1],             4(%[input])                     \n\t"
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp0],             2(%[input])                     \n\t"
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp1],             6(%[input])                     \n\t"
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_0],            $ac0,           31              \n\t"
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_1],            $ac1,           31              \n\t"
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_2 = dct_const_round_shift(temp1);
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_2],            $ac0,           31              \n\t"
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_3 = dct_const_round_shift(temp2);
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_3],            $ac1,           31              \n\t"
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[0]  = step_0 + step_3;
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[4]  = step_1 + step_2;
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[8]  = step_1 - step_2;
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[12] = step_0 - step_3;
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sh       %[Temp0],             0(%[output])                    \n\t"
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sh       %[Temp1],             8(%[output])                    \n\t"
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sh       %[Temp2],             16(%[output])                   \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sh       %[Temp3],             24(%[output])                   \n\t"
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [output] "+r" (output)
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      : [const_2_power_13] "r" (const_2_power_13),
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [cospi_24_64] "r" (cospi_24_64),
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [input] "r" (input)
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    );
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    input += 4;
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    output += 1;
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int dest_stride) {
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int16_t   step_0, step_1, step_2, step_3;
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int       Temp0, Temp1, Temp2, Temp3;
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int const_2_power_13 = 8192;
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int       i;
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t   *dest_pix;
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t   *cm = vp9_ff_cropTbl;
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* prefetch vp9_ff_cropTbl */
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl);
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl +  32);
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl +  64);
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl +  96);
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl + 128);
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl + 160);
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl + 192);
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(vp9_ff_cropTbl + 224);
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 4; ++i) {
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest_pix = (dest + i);
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __asm__ __volatile__ (
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp_1 = (input[0] + input[2]) * cospi_16_64;
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_0 = dct_const_round_shift(temp_1);
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp_2 = (input[0] - input[2]) * cospi_16_64;
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_1 = dct_const_round_shift(temp_2);
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp0],             0(%[input])                     \n\t"
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp1],             4(%[input])                     \n\t"
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp0],             2(%[input])                     \n\t"
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[Temp1],             6(%[input])                     \n\t"
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_0],            $ac0,           31              \n\t"
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_1],            $ac1,           31              \n\t"
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_2 = dct_const_round_shift(temp1);
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_2],            $ac0,           31              \n\t"
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          step_3 = dct_const_round_shift(temp2);
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step_3],            $ac1,           31              \n\t"
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /*
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[0]  = step_0 + step_3;
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[4]  = step_1 + step_2;
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[8]  = step_1 - step_2;
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          output[12] = step_0 - step_3;
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        */
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[Temp0],             %[Temp0],       8               \n\t"
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[Temp0],             %[Temp0],       4               \n\t"
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[Temp0],             %[Temp0],       8               \n\t"
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[Temp0],             %[Temp0],       4               \n\t"
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[Temp0],             %[Temp0],       8               \n\t"
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[Temp0],             %[Temp0],       4               \n\t"
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[Temp0],             %[Temp0],       8               \n\t"
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[Temp0],             %[Temp0],       4               \n\t"
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [dest_pix] "+r" (dest_pix)
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      : [const_2_power_13] "r" (const_2_power_13),
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [cospi_24_64] "r" (cospi_24_64),
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    );
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    input += 4;
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                              int dest_stride) {
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int16_t *outptr = out;
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t pos = 45;
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* bit positon for extract from acc */
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __asm__ __volatile__ (
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    "wrdsp      %[pos],     1           \n\t"
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    :
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    : [pos] "r" (pos)
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  );
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // Rows
243b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  vp9_idct4_rows_dspr2(input, outptr);
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // Columns
246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/*
 * DC-only 4x4 inverse DCT plus reconstruction (MIPS DSPr2).
 *
 * Derives one residual value a1 from input[0] and adds it, with saturation
 * to the byte range, to every pixel of the 4x4 block at `dest` (rows
 * `dest_stride` bytes apart).  Each row is processed as one 32-bit word with
 * the quad-byte saturating instructions, so every row start must be 4-byte
 * aligned.
 */
void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  int       a1, absa1;
  int       r;
  int32_t   out;
  int       t2, vector_a1, vector_a;
  uint32_t  pos = 45;
  int16_t   input_dc = input[0];

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
  /* a1 = (out + 8) >> 4 */
  __asm__ __volatile__ (
      "addi     %[out],     %[out],    8       \n\t"
      "sra      %[a1],      %[out],    4       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  /* replv.qb replicates only the low 8 bits of its operand, so a magnitude
   * above 255 would silently wrap.  Clamping to 255 is exact: a saturating
   * byte add of 255 always yields 255 and a saturating byte subtract of 255
   * always yields 0, which is what any larger magnitude must produce. */
  absa1 = (a1 < 0) ? -a1 : a1;
  if (absa1 > 255) {
    absa1 = 255;
  }

  __asm__ __volatile__ (
      "replv.qb   %[vector_a1], %[absa1]      \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [absa1] "r" (absa1)
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 4; r--;) {
      __asm__ __volatile__ (
          "lw             %[t2],          0(%[dest])                      \n\t"
          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
          "sw             %[vector_a],    0(%[dest])                      \n\t"
          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
            /* the asm loads from and stores to *dest */
          : "memory"
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 4; r--;) {
      __asm__ __volatile__ (
          "lw           %[t2],          0(%[dest])                        \n\t"
          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
          "sw           %[vector_a],    0(%[dest])                        \n\t"
          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"

          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
            /* the asm loads from and stores to *dest */
          : "memory"
      );
    }
  }
}
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst4_dspr2(const int16_t *input, int16_t *output) {
3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int s0, s1, s2, s3, s4, s5, s6, s7;
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int x0, x1, x2, x3;
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x0 = input[0];
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x1 = input[1];
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x2 = input[2];
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x3 = input[3];
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (!(x0 | x1 | x2 | x3)) {
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    output[0] = output[1] = output[2] = output[3] = 0;
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    return;
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s0 = sinpi_1_9 * x0;
3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s1 = sinpi_2_9 * x0;
3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s2 = sinpi_3_9 * x1;
3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s3 = sinpi_4_9 * x2;
3405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s4 = sinpi_1_9 * x2;
3415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s5 = sinpi_2_9 * x3;
3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s6 = sinpi_4_9 * x3;
3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s7 = x0 - x2 + x3;
3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x0 = s0 + s3 + s5;
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x1 = s1 - s4 - s6;
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x2 = sinpi_3_9 * s7;
3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  x3 = s2;
3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s0 = x0 + x3;
3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s1 = x1 + x3;
3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s2 = x2;
3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  s3 = x0 + x1 - x3;
3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // 1-D transform scaling factor is sqrt(2).
3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // + 1b (addition) = 29b.
3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // Hence the output bit depth is 15b.
3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  output[0] = dct_const_round_shift(s0);
3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  output[1] = dct_const_round_shift(s1);
3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  output[2] = dct_const_round_shift(s2);
3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  output[3] = dct_const_round_shift(s3);
3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             int dest_stride, int tx_type) {
3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int i, j;
3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int16_t *outptr = out;
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int16_t temp_in[4 * 4], temp_out[4];
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t pos = 45;
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* bit positon for extract from acc */
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __asm__ __volatile__ (
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    "wrdsp      %[pos],     1           \n\t"
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    :
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    : [pos] "r" (pos)
3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  );
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  switch (tx_type) {
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case DCT_DCT:   // DCT in both horizontal and vertical
382b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      vp9_idct4_rows_dspr2(input, outptr);
383b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case ADST_DCT:  // ADST in vertical, DCT in horizontal
386b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      vp9_idct4_rows_dspr2(input, outptr);
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      outptr = out;
3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (i = 0; i < 4; ++i) {
391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian        iadst4_dspr2(outptr, temp_out);
3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (j = 0; j < 4; ++j)
3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          dest[j * dest_stride + i] =
3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      + dest[j * dest_stride + i]);
3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        outptr += 4;
3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case DCT_ADST:  // DCT in vertical, ADST in horizontal
4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (i = 0; i < 4; ++i) {
403b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian        iadst4_dspr2(input, outptr);
4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input  += 4;
4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        outptr += 4;
4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (i = 0; i < 4; ++i) {
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (j = 0; j < 4; ++j) {
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp_in[i * 4 + j] = out[j * 4 + i];
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
413b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian      vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case ADST_ADST:  // ADST in both directions
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (i = 0; i < 4; ++i) {
417b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian        iadst4_dspr2(input, outptr);
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input  += 4;
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        outptr += 4;
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (i = 0; i < 4; ++i) {
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (j = 0; j < 4; ++j)
4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          temp_in[j] = out[j * 4 + i];
425b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian        iadst4_dspr2(temp_in, temp_out);
4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (j = 0; j < 4; ++j)
4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          dest[j * dest_stride + i] =
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                  clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      + dest[j * dest_stride + i]);
4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    default:
4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n");
4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif  // #if HAVE_DSPR2
439