/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vpx_config.h"
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/inv_txfm_dspr2.h"
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/txfm_common.h"
145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
167bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
177bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
197bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step1_28, step1_29, step1_30, step1_31;
227bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
267bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step2_28, step2_29, step2_30, step2_31;
277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int step3_29, step3_30, step3_31;
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int temp0, temp1, temp2, temp3;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int load1, load2, load3, load4;
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int result1, result2;
347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int i;
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dest_pix, *dest_pix1;
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int const_2_power_13 = 8192;
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* prefetch vpx_ff_cropTbl */
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(vpx_ff_cropTbl);
417bc9febe8749e98a3812a0dc4380ceae75c29450Johann  prefetch_load(vpx_ff_cropTbl + 32);
427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  prefetch_load(vpx_ff_cropTbl + 64);
437bc9febe8749e98a3812a0dc4380ceae75c29450Johann  prefetch_load(vpx_ff_cropTbl + 96);
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(vpx_ff_cropTbl + 128);
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(vpx_ff_cropTbl + 160);
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(vpx_ff_cropTbl + 192);
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(vpx_ff_cropTbl + 224);
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 32; ++i) {
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest_pix = dest + i;
517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dest_pix1 = dest + i + 31 * stride;
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
537bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],             2(%[input])                     \n\t"
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             62(%[input])                    \n\t"
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             34(%[input])                    \n\t"
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],             30(%[input])                    \n\t"
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac3,           31              \n\t"
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac1,           31              \n\t"
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_17],          $ac1,           31              \n\t"
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_30],          $ac3,           31              \n\t"
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1037bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
1047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
1057bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
1067bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
1077bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
1087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
1097bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
1107bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
1117bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
1127bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1137bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],             18(%[input])                    \n\t"
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             46(%[input])                    \n\t"
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             50(%[input])                    \n\t"
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],             14(%[input])                    \n\t"
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac3,           31              \n\t"
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac1,           31              \n\t"
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_18],          $ac1,           31              \n\t"
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_29],          $ac3,           31              \n\t"
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1637bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
1647bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
1657bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
1667bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
1677bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
1687bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
1697bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
1707bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
1717bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
1727bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1737bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],             10(%[input])                    \n\t"
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             54(%[input])                    \n\t"
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             42(%[input])                    \n\t"
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],             22(%[input])                    \n\t"
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac3,           31              \n\t"
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac1,           31              \n\t"
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_21],          $ac1,           31              \n\t"
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_26],          $ac3,           31              \n\t"
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2237bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
2247bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
2257bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
2267bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
2277bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
2287bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
2297bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
2307bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
2317bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
2327bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],             26(%[input])                    \n\t"
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             38(%[input])                    \n\t"
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             58(%[input])                    \n\t"
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],              6(%[input])                    \n\t"
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac3,           31              \n\t"
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac1,           31              \n\t"
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_22],          $ac1,           31              \n\t"
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_25],          $ac3,           31              \n\t"
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2797bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
2807bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
2817bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
2827bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
2837bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
2847bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
2857bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
2867bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
2877bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
2887bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2897bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],              4(%[input])                    \n\t"
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             60(%[input])                    \n\t"
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             36(%[input])                    \n\t"
2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],             28(%[input])                    \n\t"
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac3,           31              \n\t"
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac1,           31              \n\t"
3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step2_9],           $ac1,           31              \n\t"
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step2_14],          $ac3,           31              \n\t"
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3357bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
3367bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
3377bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
3387bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
3397bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_15] "=&r"(step2_15)
3407bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
3417bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
3427bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
3437bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
3447bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],             20(%[input])                    \n\t"
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             44(%[input])                    \n\t"
3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             52(%[input])                    \n\t"
3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],             12(%[input])                    \n\t"
3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac3,           31              \n\t"
3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac1,           31              \n\t"
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step2_10],          $ac1,           31              \n\t"
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step2_13],          $ac3,           31              \n\t"
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3917bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
3927bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
3937bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
3947bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
3957bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
3967bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
3977bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
3987bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
3997bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
4007bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step3_10],          $ac0,           31              \n\t"
4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step3_13],          $ac1,           31              \n\t"
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step3_11],          $ac2,           31              \n\t"
4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step3_12],          $ac3,           31              \n\t"
4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4367bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
4377bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
4387bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
4397bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
4407bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_15] "=&r"(step3_15)
4417bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
4427bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
4437bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
4447bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
4457bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
4467bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
4507bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
4517bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac1                            \n\t"
4527bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp0],             %[step1_17],    %[step1_18]     \n\t"
4537bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp1],             %[step1_30],    %[step1_29]     \n\t"
4547bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_17],          %[step1_17],    %[step1_18]     \n\t"
4557bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_30],          %[step1_30],    %[step1_29]     \n\t"
4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4577bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
4587bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
4597bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_18],          $ac0,           31              \n\t"
4607bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
4617bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
4627bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_29],          $ac1,           31              \n\t"
4637bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4647bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
4657bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
4667bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
4677bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
4687bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
4697bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
4707bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_8_64] "r"(cospi_8_64));
4717bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
4757bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
4767bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac1                            \n\t"
4777bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp0],             %[step1_16],    %[step1_19]     \n\t"
4787bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp1],             %[step1_31],    %[step1_28]     \n\t"
4797bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_16],          %[step1_16],    %[step1_19]     \n\t"
4807bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_31],          %[step1_31],    %[step1_28]     \n\t"
4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4827bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
4837bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
4847bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_19],          $ac0,           31              \n\t"
4857bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
4867bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
4877bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_28],          $ac1,           31              \n\t"
4887bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4897bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
4907bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
4917bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
4927bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
4937bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
4947bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
4957bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_8_64] "r"(cospi_8_64));
4967bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
4987bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
4997bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac0                            \n\t"
5007bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
5017bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac1                            \n\t"
5027bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp0],             %[step1_23],    %[step1_20]     \n\t"
5037bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp1],             %[step1_24],    %[step1_27]     \n\t"
5047bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_23],          %[step1_23],    %[step1_20]     \n\t"
5057bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_24],          %[step1_24],    %[step1_27]     \n\t"
5067bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5077bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
5087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
5097bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_27],          $ac0,           31              \n\t"
5107bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
5117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
5127bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_20],          $ac1,           31              \n\t"
5137bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5147bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
5157bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
5167bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
5177bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
5187bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
5197bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
5207bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_8_64] "r"(cospi_8_64));
5217bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5227bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
5277bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp0],             %[step1_22],    %[step1_21]     \n\t"
5287bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp1],             %[step1_25],    %[step1_26]     \n\t"
5297bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_22],          %[step1_22],    %[step1_21]     \n\t"
5307bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step3_25],          %[step1_25],    %[step1_26]     \n\t"
5317bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5327bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac0,                 %[temp0],       %[cospi_24_64]  \n\t"
5337bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac0,                 %[temp1],       %[cospi_8_64]   \n\t"
5347bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_21],          $ac0,           31              \n\t"
5357bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "msub     $ac1,                 %[temp0],       %[cospi_8_64]   \n\t"
5367bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac1,                 %[temp1],       %[cospi_24_64]  \n\t"
5377bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step3_26],          $ac1,           31              \n\t"
5387bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5397bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
5407bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
5417bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
5427bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
5437bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
5447bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
5457bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_8_64] "r"(cospi_8_64));
5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
5487bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_16],          %[step3_16],    %[step3_23]     \n\t"
5497bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_17],          %[step3_17],    %[step3_22]     \n\t"
5507bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_18],          %[step3_18],    %[step3_21]     \n\t"
5517bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_19],          %[step3_19],    %[step3_20]     \n\t"
5527bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_20],          %[step3_19],    %[step3_20]     \n\t"
5537bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_21],          %[step3_18],    %[step3_21]     \n\t"
5547bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_22],          %[step3_17],    %[step3_22]     \n\t"
5557bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_23],          %[step3_16],    %[step3_23]     \n\t"
5567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5577bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
5587bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
5597bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
5617bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
5627bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
5637bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
5647bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5667bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
5677bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_24],          %[step3_31],    %[step3_24]     \n\t"
5687bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_25],          %[step3_30],    %[step3_25]     \n\t"
5697bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_26],          %[step3_29],    %[step3_26]     \n\t"
5707bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_27],          %[step3_28],    %[step3_27]     \n\t"
5717bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_28],          %[step3_28],    %[step3_27]     \n\t"
5727bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_29],          %[step3_29],    %[step3_26]     \n\t"
5737bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_30],          %[step3_30],    %[step3_25]     \n\t"
5747bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_31],          %[step3_31],    %[step3_24]     \n\t"
5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5767bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
5777bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
5787bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
5797bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
5807bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
5827bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
5837bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
5847bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5857bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],             0(%[input])                     \n\t"
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             32(%[input])                    \n\t"
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             16(%[input])                    \n\t"
5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],             48(%[input])                    \n\t"
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[result1],           %[load1],       %[load2]        \n\t"
5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[result2],           %[load1],       %[load2]        \n\t"
5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac3,           31              \n\t"
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac1,           31              \n\t"
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"
6135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [result1] "=&r"(result1),
6197bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
6207bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
6217bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
6227bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_3] "=&r"(step1_3)
6237bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
6247bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
6257bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_16_64] "r"(cospi_16_64));
6267bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6277bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load1],             8(%[input])                     \n\t"
6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load2],             56(%[input])                    \n\t"
6305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load3],             40(%[input])                    \n\t"
6315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lh       %[load4],             24(%[input])                    \n\t"
6325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp0],             $ac1,           31              \n\t"
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp3],             $ac3,           31              \n\t"
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac2                            \n\t"
6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp1],             $ac2,           31              \n\t"
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[temp2],             $ac1,           31              \n\t"
6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac1                            \n\t"
6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac3                            \n\t"
6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[load1],             %[load1],       %[temp1]        \n\t"
6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[load2],             %[load2],       %[temp3]        \n\t"
6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_5],           $ac1,           31              \n\t"
6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp     %[step1_6],           $ac3,           31              \n\t"
6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6767bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
6777bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
6787bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
6797bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
6807bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_7] "=&r"(step1_7)
6817bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
6827bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
6837bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
6847bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [cospi_16_64] "r"(cospi_16_64));
6857bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6867bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
6877bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_0],          %[step1_0],    %[step1_7]     \n\t"
6887bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_1],          %[step1_1],    %[step1_6]     \n\t"
6897bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_2],          %[step1_2],    %[step1_5]     \n\t"
6907bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step2_3],          %[step1_3],    %[step1_4]     \n\t"
6917bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_4],          %[step1_3],    %[step1_4]     \n\t"
6927bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_5],          %[step1_2],    %[step1_5]     \n\t"
6937bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_6],          %[step1_1],    %[step1_6]     \n\t"
6947bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step2_7],          %[step1_0],    %[step1_7]     \n\t"
6957bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6967bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
6977bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
6987bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
6997bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
7007bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
7017bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
7027bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
7037bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // stage 7
7067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
7077bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_0],          %[step2_0],    %[step3_15]     \n\t"
7087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_1],          %[step2_1],    %[step3_14]     \n\t"
7097bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_2],          %[step2_2],    %[step3_13]     \n\t"
7107bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_3],          %[step2_3],    %[step3_12]     \n\t"
7117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_12],         %[step2_3],    %[step3_12]     \n\t"
7127bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_13],         %[step2_2],    %[step3_13]     \n\t"
7137bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_14],         %[step2_1],    %[step3_14]     \n\t"
7147bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_15],         %[step2_0],    %[step3_15]     \n\t"
7157bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7167bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
7177bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
7187bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
7197bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
7207bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
7217bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
7227bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
7237bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
7247bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
7267bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_4],          %[step2_4],    %[step3_11]     \n\t"
7277bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_5],          %[step2_5],    %[step3_10]     \n\t"
7287bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_6],          %[step2_6],    %[step3_9]      \n\t"
7297bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[step1_7],          %[step2_7],    %[step3_8]      \n\t"
7307bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_8],          %[step2_7],    %[step3_8]      \n\t"
7317bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_9],          %[step2_6],    %[step3_9]      \n\t"
7327bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_10],         %[step2_5],    %[step3_10]     \n\t"
7337bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[step1_11],         %[step2_4],    %[step3_11]     \n\t"
7347bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7357bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
7367bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
7377bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
7387bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
7397bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
7407bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
7417bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
7427bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
7437bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
7467bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[temp1],             %[step2_27],    %[step2_20]     \n\t"
7477bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp2],             %[step2_26],    %[step2_21]     \n\t"
7487bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[temp3],             %[step2_26],    %[step2_21]     \n\t"
7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
7527bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
7537bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac1                            \n\t"
7547bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
7557bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac2                            \n\t"
7567bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
7577bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac3                            \n\t"
7587bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
7607bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
7617bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac2,                 %[temp2],       %[cospi_16_64]  \n\t"
7627bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac3,                 %[temp3],       %[cospi_16_64]  \n\t"
7635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7647bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_20],          $ac0,           31              \n\t"
7657bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_27],          $ac1,           31              \n\t"
7667bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_21],          $ac2,           31              \n\t"
7677bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_26],          $ac3,           31              \n\t"
7685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7697bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
7707bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
7717bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
7727bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_26] "=&r"(step1_26)
7737bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
7747bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
7757bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
7765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
7785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
7797bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[temp1],             %[step2_25],    %[step2_22]     \n\t"
7807bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sub      %[temp2],             %[step2_24],    %[step2_23]     \n\t"
7817bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "add      %[temp3],             %[step2_24],    %[step2_23]     \n\t"
7825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
7845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi     $zero,                $ac0                            \n\t"
7857bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
7867bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac1                            \n\t"
7877bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
7887bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac2                            \n\t"
7897bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
7907bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "mthi     $zero,                $ac3                            \n\t"
7915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7927bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
7937bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
7947bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac2,                 %[temp2],       %[cospi_16_64]  \n\t"
7957bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "madd     $ac3,                 %[temp3],       %[cospi_16_64]  \n\t"
7965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7977bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_22],          $ac0,           31              \n\t"
7987bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_25],          $ac1,           31              \n\t"
7997bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_23],          $ac2,           31              \n\t"
8007bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "extp     %[step1_24],          $ac3,           31              \n\t"
8017bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8027bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
8037bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
8047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
8057bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_24] "=&r"(step1_24)
8067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
8077bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
8087bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
8097bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8107bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
8115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
8125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"
8135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
8145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
8155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"
8185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
8197bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
8205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
8215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
8225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
8235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
8245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
8255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
8267bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
8275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
8295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"
8305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
8325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
8335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
8345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"
8355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
8367bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
8375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
8385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
8395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
8405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
8415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
8425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
8437bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
8445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8457bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
8467bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
8477bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
8487bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
8497bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
8507bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
8517bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_31] "r"(step2_31));
8525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
8545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
8555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
8565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
8575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8587bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
8595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
8605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
8615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
8625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
8637bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
8645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
8655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
8665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
8675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
8687bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
8695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
8715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
8725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
8735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
8747bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
8755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
8775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
8785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
8797bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
8805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8817bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
8827bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
8837bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
8847bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
8857bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_15] "r"(step3_15));
8865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8877bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
8885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
8895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"
8905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
8915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
8925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
8935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
8945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"
8955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
8967bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
8975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
8985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
8995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
9005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
9015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
9025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
9037bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
9045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
9065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"
9075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
9085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
9095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
9105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
9115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"
9125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
9137bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
9145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
9155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
9165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
9175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
9185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
9195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
9207bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
9215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9227bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
9237bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
9247bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4),
9257bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
9267bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
9277bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
9287bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_27] "r"(step1_27));
9295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
9315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
9325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
9335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
9345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
9365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
9375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
9385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
9395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
9407bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
9415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
9425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
9435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
9445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
9457bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
9465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
9485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
9495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
9505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
9517bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
9525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
9535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
9545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
9555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
9567bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
9575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9587bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
9597bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
9607bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
9617bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
9627bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_15] "r"(step3_15));
9635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9647bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
9655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
9665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"
9675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
9685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
9695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
9705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
9715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"
9725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
9737bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
9745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
9755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
9765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
9775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
9785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
9795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
9807bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
9815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
9835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"
9845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
9855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
9865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
9875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
9885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"
9895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
9907bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
9915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
9925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
9935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
9945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
9955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
9965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
9977bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
9985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9997bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
10007bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
10017bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8),
10027bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
10037bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
10047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
10057bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_23] "r"(step1_23));
10065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
10085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
10095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
10105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
10115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
10135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
10145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
10155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
10165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
10177bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
10185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
10195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
10205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
10215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
10227bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
10235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
10255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
10265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
10275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
10287bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
10295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
10305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
10315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
10325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
10337bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
10345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10357bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
10367bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
10377bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
10387bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
10397bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_15] "r"(step3_15));
10405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
10425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
10435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"
10445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
10455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
10465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
10475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
10485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"
10495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
10507bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
10515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
10525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
10535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
10545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
10555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
10565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
10577bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
10585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
10605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"
10615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp0],         %[temp0],           32              \n\t"
10625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp0],         %[temp0],           6               \n\t"
10635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
10645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
10655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"
10665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
10677bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
10685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
10695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addi     %[temp1],         %[temp1],           32              \n\t"
10705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sra      %[temp1],         %[temp1],           6               \n\t"
10715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
10725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
10735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
10745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10757bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
10767bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
10777bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12),
10787bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_13] "r"(step1_13), [step1_14] "r"(step1_14),
10797bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
10807bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_17] "r"(step2_17), [step2_18] "r"(step2_18),
10817bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step2_19] "r"(step2_19));
10825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
10845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
10855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
10865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
10875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10887bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
10895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
10905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
10915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
10925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
10937bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
10945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
10955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
10965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
10975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
10987bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
10995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
11015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
11025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
11035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
11047bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
11055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
11065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
11075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
11085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
11095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11107bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
11117bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
11127bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
11137bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
11147bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [step3_15] "r"(step3_15));
11155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    input += 32;
11175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
11185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
11195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif  // #if HAVE_DSPR2
1120