15ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* 25ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 35ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * 45ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * Use of this source code is governed by a BSD-style license 55ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * that can be found in the LICENSE file in the root of the source 65ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * tree. An additional intellectual property rights grant can be found 75ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * in the file PATENTS. All contributing project authors may 85ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * be found in the AUTHORS file in the root of the source tree. 95ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang */ 105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <assert.h> 125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <stdio.h> 135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vpx_config.h" 155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vp9_rtcd.h" 165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_common.h" 175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_blockd.h" 185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_idct.h" 195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" 205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2 22b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct16_rows_dspr2(const int16_t *input, int16_t *output, 23b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian uint32_t no_rows) { 245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int i; 255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step1_10, step1_11, step1_12, step1_13; 275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step2_0, step2_1, step2_2, step2_3; 285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step2_8, step2_9, step2_10, step2_11; 295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step2_12, step2_13, step2_14, step2_15; 305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int load1, load2, load3, load4, load5, load6, load7, load8; 315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int result1, result2, result3, result4; 325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int const_2_power_13 = 8192; 335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = no_rows; i--; ) { 355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch row */ 365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load((const uint8_t *)(input + 16)); 375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load1], 0(%[input]) \n\t" 405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load2], 16(%[input]) \n\t" 415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load3], 8(%[input]) \n\t" 425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load4], 24(%[input]) \n\t" 435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[result1], %[load1], %[load2] \n\t" 495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[result2], %[load1], %[load2] \n\t" 505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[result1], %[cospi_16_64] \n\t" 515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[result2], %[cospi_16_64] \n\t" 525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_0], $ac1, 31 \n\t" 535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_1], $ac2, 31 \n\t" 545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load3], %[cospi_24_64] \n\t" 585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load4], %[cospi_8_64] \n\t" 595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_2], $ac3, 31 \n\t" 605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load3], %[cospi_8_64] \n\t" 645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load4], %[cospi_24_64] \n\t" 655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_3], $ac1, 31 \n\t" 665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_0], %[step2_0], %[step2_3] \n\t" 685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_1], %[step2_1], %[step2_2] \n\t" 695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[step1_2], %[step2_1], %[step2_2] \n\t" 705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[step1_3], %[step2_0], %[step2_3] \n\t" 715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load1] "=&r" (load1), [load2] "=&r" (load2), 735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load3] "=&r" (load3), [load4] "=&r" (load4), 745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), 765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), 775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), 785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) 795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), 815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_16_64] "r" (cospi_16_64) 825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load5], 2(%[input]) \n\t" 865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load6], 30(%[input]) \n\t" 875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load7], 18(%[input]) \n\t" 885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load8], 14(%[input]) \n\t" 895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load5], %[cospi_30_64] \n\t" 965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load6], %[cospi_2_64] \n\t" 975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result1], $ac1, 31 \n\t" 985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load7], %[cospi_14_64] \n\t" 1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load8], %[cospi_18_64] \n\t" 1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result2], $ac3, 31 \n\t" 1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load7], %[cospi_18_64] \n\t" 1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load8], %[cospi_14_64] \n\t" 1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result3], $ac1, 31 \n\t" 1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load5], %[cospi_2_64] \n\t" 1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load6], %[cospi_30_64] \n\t" 1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result4], $ac2, 31 \n\t" 1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[result1], %[result2] \n\t" 1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[result4], %[result3] \n\t" 1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load6], %[cospi_24_64] \n\t" 1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load5], %[cospi_8_64] \n\t" 1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load5], %[cospi_24_64] \n\t" 1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load6], %[cospi_8_64] \n\t" 1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_9], $ac1, 31 \n\t" 1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_14], $ac3, 31 \n\t" 1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_8], %[result1], %[result2] \n\t" 1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_15], %[result4], %[result3] \n\t" 1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6), 1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load7] "=&r" (load7), [load8] "=&r" (load8), 1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result3] "=&r" (result3), [result4] "=&r" (result4), 1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), 1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) 1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), 1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), 1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load1], 10(%[input]) \n\t" 1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load2], 22(%[input]) \n\t" 1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load3], 26(%[input]) \n\t" 1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load4], 6(%[input]) \n\t" 1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load1], %[cospi_22_64] \n\t" 1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load2], %[cospi_10_64] \n\t" 1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result1], $ac1, 31 \n\t" 1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load3], %[cospi_6_64] \n\t" 1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load4], %[cospi_26_64] \n\t" 1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result2], $ac3, 31 \n\t" 1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load1], %[cospi_10_64] \n\t" 1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load2], %[cospi_22_64] \n\t" 1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result3], $ac1, 31 \n\t" 1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load3], %[cospi_26_64] \n\t" 1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load4], %[cospi_6_64] \n\t" 1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result4], $ac2, 31 \n\t" 1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load1], %[result2], %[result1] \n\t" 1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load2], %[result4], %[result3] \n\t" 1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load1], %[cospi_24_64] \n\t" 1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load2], %[cospi_8_64] \n\t" 1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load2], %[cospi_24_64] \n\t" 1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load1], %[cospi_8_64] \n\t" 1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_10], $ac1, 31 \n\t" 1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_13], $ac3, 31 \n\t" 1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_11], %[result1], %[result2] \n\t" 1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_12], %[result4], %[result3] \n\t" 1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load1] "=&r" (load1), [load2] "=&r" (load2), 1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load3] "=&r" (load3), [load4] "=&r" (load4), 1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result3] "=&r" (result3), [result4] "=&r" (result4), 2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), 2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) 2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), 2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), 2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load5], 4(%[input]) \n\t" 2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load6], 28(%[input]) \n\t" 2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load7], 20(%[input]) \n\t" 2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load8], 12(%[input]) \n\t" 2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load5], %[cospi_28_64] \n\t" 2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load6], %[cospi_4_64] \n\t" 2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result1], $ac1, 31 \n\t" 2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load7], %[cospi_12_64] \n\t" 2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load8], %[cospi_20_64] \n\t" 2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result2], $ac3, 31 \n\t" 2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load7], %[cospi_20_64] \n\t" 2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load8], %[cospi_12_64] \n\t" 2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result3], $ac1, 31 \n\t" 2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load5], %[cospi_4_64] \n\t" 2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load6], %[cospi_28_64] \n\t" 2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result4], $ac2, 31 \n\t" 2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[result4], %[result3] \n\t" 2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[result1] \n\t" 2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[result2] \n\t" 2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[result1], %[result2] \n\t" 2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[result3] \n\t" 2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[result4] \n\t" 2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load5], %[cospi_16_64] \n\t" 2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load6], %[cospi_16_64] \n\t" 2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_5], $ac1, 31 \n\t" 2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_6], $ac3, 31 \n\t" 2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_4], %[result1], %[result2] \n\t" 2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_7], %[result4], %[result3] \n\t" 2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6), 2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load7] "=&r" (load7), [load8] "=&r" (load8), 2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result3] "=&r" (result3), [result4] "=&r" (result4), 2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), 2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) 2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), 2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), 2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_16_64] "r" (cospi_16_64) 2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac0 \n\t" 2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac0 \n\t" 2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step2_14], %[step2_13] \n\t" 2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_9] \n\t" 2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_10] \n\t" 2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac0, %[load5], %[cospi_16_64] \n\t" 2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step2_14], %[step2_13] \n\t" 2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_10] \n\t" 2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_9] \n\t" 2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load6], %[cospi_16_64] \n\t" 2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step2_15], %[step2_12] \n\t" 2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_8] \n\t" 2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_11] \n\t" 2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load5], %[cospi_16_64] \n\t" 3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step2_15], %[step2_12] \n\t" 3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_11] \n\t" 3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_8] \n\t" 3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load6], %[cospi_16_64] \n\t" 3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_10], $ac0, 31 \n\t" 3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_13], $ac1, 31 \n\t" 3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_11], $ac2, 31 \n\t" 3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_12], $ac3, 31 \n\t" 3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6), 3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), 3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) 3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), 3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), 3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), 3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), 3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), 3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_16_64] "r" (cospi_16_64) 3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_0], %[step1_7] \n\t" 3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_12] \n\t" 3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_15] \n\t" 3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_1], %[step1_6] \n\t" 3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_13] \n\t" 3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_14] \n\t" 3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 0(%[output]) \n\t" 3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 32(%[output]) \n\t" 3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_1], %[step1_6] \n\t" 3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_9] \n\t" 3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_10] \n\t" 3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_0], %[step1_7] \n\t" 3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_8] \n\t" 3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_11] \n\t" 3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 192(%[output]) \n\t" 3405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 224(%[output]) \n\t" 3415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_0], %[step1_7] \n\t" 3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_8] \n\t" 3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_11] \n\t" 3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_1], %[step1_6] \n\t" 3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_9] \n\t" 3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_10] \n\t" 3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 256(%[output]) \n\t" 3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 288(%[output]) \n\t" 3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_1], %[step1_6] \n\t" 3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_13] \n\t" 3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_14] \n\t" 3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_0], %[step1_7] \n\t" 3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_12] \n\t" 3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_15] \n\t" 3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 448(%[output]) \n\t" 3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 480(%[output]) \n\t" 3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6) 3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [output] "r" (output), 3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), 3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), 3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), 3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), 3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), 3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) 3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_2], %[step1_5] \n\t" 3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step1_13] \n\t" 3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_3], %[step1_4] \n\t" 3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step1_12] \n\t" 3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 64(%[output]) \n\t" 3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 96(%[output]) \n\t" 3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_3], %[step1_4] \n\t" 3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step1_11] \n\t" 3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_2], %[step1_5] \n\t" 3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step1_10] \n\t" 3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 128(%[output]) \n\t" 3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 160(%[output]) \n\t" 3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_2], %[step1_5] \n\t" 3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step1_10] \n\t" 3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_3], %[step1_4] \n\t" 3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step1_11] \n\t" 3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 320(%[output]) \n\t" 3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 352(%[output]) \n\t" 3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_3], %[step1_4] \n\t" 3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step1_12] \n\t" 3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_2], %[step1_5] \n\t" 3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step1_13] \n\t" 3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load5], 384(%[output]) \n\t" 3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sh %[load6], 416(%[output]) \n\t" 3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6) 3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [output] "r" (output), 3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), 3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), 3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), 3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) 4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang input += 16; 4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output += 1; 4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 407b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, 408b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int dest_stride) { 4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int i; 4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step1_8, step1_9, step1_10, step1_11; 4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step1_12, step1_13, step1_14, step1_15; 4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step2_0, step2_1, step2_2, step2_3; 4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step2_8, step2_9, step2_10, step2_11; 4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int step2_12, step2_13, step2_14, step2_15; 4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int load1, load2, load3, load4, load5, load6, load7, load8; 4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int result1, result2, result3, result4; 4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int const_2_power_13 = 8192; 4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dest_pix; 4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *cm = vp9_ff_cropTbl; 4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch vp9_ff_cropTbl */ 4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl); 4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl + 32); 4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl + 64); 4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl + 96); 4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl + 128); 4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl + 160); 4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl + 192); 4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load(vp9_ff_cropTbl + 224); 4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 16; ++i) { 4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dest_pix = (dest + i); 4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load1], 0(%[input]) \n\t" 4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load2], 16(%[input]) \n\t" 4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load3], 8(%[input]) \n\t" 4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load4], 24(%[input]) \n\t" 4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 4435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 4445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[result1], %[load1], %[load2] \n\t" 4455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[result2], %[load1], %[load2] \n\t" 4465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[result1], %[cospi_16_64] \n\t" 4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[result2], %[cospi_16_64] \n\t" 4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_0], $ac1, 31 \n\t" 4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_1], $ac2, 31 \n\t" 4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 4535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load3], %[cospi_24_64] \n\t" 4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load4], %[cospi_8_64] \n\t" 4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_2], $ac3, 31 \n\t" 4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load3], %[cospi_8_64] \n\t" 4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load4], %[cospi_24_64] \n\t" 4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_3], $ac1, 31 \n\t" 4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_0], %[step2_0], %[step2_3] \n\t" 4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_1], %[step2_1], %[step2_2] \n\t" 4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[step1_2], %[step2_1], %[step2_2] \n\t" 4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[step1_3], %[step2_0], %[step2_3] \n\t" 4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load1] "=&r" (load1), [load2] "=&r" (load2), 4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load3] "=&r" (load3), [load4] "=&r" (load4), 4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), 4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), 4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), 4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) 4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), 4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_16_64] "r" (cospi_16_64) 4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load5], 2(%[input]) \n\t" 4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load6], 30(%[input]) \n\t" 4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load7], 18(%[input]) \n\t" 4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load8], 14(%[input]) \n\t" 4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 4875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 4885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 4895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load5], %[cospi_30_64] \n\t" 4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load6], %[cospi_2_64] \n\t" 4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result1], $ac1, 31 \n\t" 4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load7], %[cospi_14_64] \n\t" 4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load8], %[cospi_18_64] \n\t" 4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result2], $ac3, 31 \n\t" 4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load7], %[cospi_18_64] \n\t" 5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load8], %[cospi_14_64] \n\t" 5065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result3], $ac1, 31 \n\t" 5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load5], %[cospi_2_64] \n\t" 5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load6], %[cospi_30_64] \n\t" 5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result4], $ac2, 31 \n\t" 5115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[result1], %[result2] \n\t" 5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[result4], %[result3] \n\t" 5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load6], %[cospi_24_64] \n\t" 5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load5], %[cospi_8_64] \n\t" 5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load5], %[cospi_24_64] \n\t" 5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load6], %[cospi_8_64] \n\t" 5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_9], $ac1, 31 \n\t" 5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_14], $ac3, 31 \n\t" 5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_8], %[result1], %[result2] \n\t" 5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_15], %[result4], %[result3] \n\t" 5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6), 5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load7] "=&r" (load7), [load8] "=&r" (load8), 5325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result3] "=&r" (result3), [result4] "=&r" (result4), 5345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), 5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) 5365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 5375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), 5385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), 5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load1], 10(%[input]) \n\t" 5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load2], 22(%[input]) \n\t" 5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load3], 26(%[input]) \n\t" 5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load4], 6(%[input]) \n\t" 5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load1], %[cospi_22_64] \n\t" 5545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load2], %[cospi_10_64] \n\t" 5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result1], $ac1, 31 \n\t" 5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load3], %[cospi_6_64] \n\t" 5585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load4], %[cospi_26_64] \n\t" 5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result2], $ac3, 31 \n\t" 5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load1], %[cospi_10_64] \n\t" 5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load2], %[cospi_22_64] \n\t" 5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result3], $ac1, 31 \n\t" 5695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load3], %[cospi_26_64] \n\t" 5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load4], %[cospi_6_64] \n\t" 5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result4], $ac2, 31 \n\t" 5735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 5765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 5775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 5785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load1], %[result2], %[result1] \n\t" 5805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load2], %[result4], %[result3] \n\t" 5815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load1], %[cospi_24_64] \n\t" 5835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load2], %[cospi_8_64] \n\t" 5845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load2], %[cospi_24_64] \n\t" 5855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load1], %[cospi_8_64] \n\t" 5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_10], $ac1, 31 \n\t" 5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step2_13], $ac3, 31 \n\t" 5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_11], %[result1], %[result2] \n\t" 5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step2_12], %[result4], %[result3] \n\t" 5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load1] "=&r" (load1), [load2] "=&r" (load2), 5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load3] "=&r" (load3), [load4] "=&r" (load4), 5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result3] "=&r" (result3), [result4] "=&r" (result4), 5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), 5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) 5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), 6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), 6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load5], 4(%[input]) \n\t" 6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load6], 28(%[input]) \n\t" 6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load7], 20(%[input]) \n\t" 6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lh %[load8], 12(%[input]) \n\t" 6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 6135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load5], %[cospi_28_64] \n\t" 6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac1, %[load6], %[cospi_4_64] \n\t" 6175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result1], $ac1, 31 \n\t" 6185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load7], %[cospi_12_64] \n\t" 6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "msub $ac3, %[load8], %[cospi_20_64] \n\t" 6215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result2], $ac3, 31 \n\t" 6225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 6245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 6255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 6265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 6275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load7], %[cospi_20_64] \n\t" 6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load8], %[cospi_12_64] \n\t" 6305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result3], $ac1, 31 \n\t" 6315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load5], %[cospi_4_64] \n\t" 6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load6], %[cospi_28_64] \n\t" 6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[result4], $ac2, 31 \n\t" 6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[result4], %[result3] \n\t" 6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[result1] \n\t" 6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[result2] \n\t" 6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[result1], %[result2] \n\t" 6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[result3] \n\t" 6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[result4] \n\t" 6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load5], %[cospi_16_64] \n\t" 6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load6], %[cospi_16_64] \n\t" 6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_5], $ac1, 31 \n\t" 6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_6], $ac3, 31 \n\t" 6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_4], %[result1], %[result2] \n\t" 6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[step1_7], %[result4], %[result3] \n\t" 6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6), 6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load7] "=&r" (load7), [load8] "=&r" (load8), 6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result1] "=&r" (result1), [result2] "=&r" (result2), 6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [result3] "=&r" (result3), [result4] "=&r" (result4), 6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), 6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) 6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), 6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), 6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_16_64] "r" (cospi_16_64) 6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac0 \n\t" 6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac0 \n\t" 6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac1 \n\t" 6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step2_14], %[step2_13] \n\t" 6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_9] \n\t" 6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_10] \n\t" 6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac0, %[load5], %[cospi_16_64] \n\t" 6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step2_14], %[step2_13] \n\t" 6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_10] \n\t" 6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_9] \n\t" 6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac1, %[load6], %[cospi_16_64] \n\t" 6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac2 \n\t" 6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[const_2_power_13], $ac3 \n\t" 6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step2_15], %[step2_12] \n\t" 6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step2_8] \n\t" 6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step2_11] \n\t" 6965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac2, %[load5], %[cospi_16_64] \n\t" 6985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step2_15], %[step2_12] \n\t" 7005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step2_11] \n\t" 7015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step2_8] \n\t" 7025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "madd $ac3, %[load6], %[cospi_16_64] \n\t" 7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_10], $ac0, 31 \n\t" 7065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_13], $ac1, 31 \n\t" 7075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_11], $ac2, 31 \n\t" 7085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[step1_12], $ac3, 31 \n\t" 7095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6), 7115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), 7125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) 7135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [const_2_power_13] "r" (const_2_power_13), 7145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), 7155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), 7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), 7175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), 7185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cospi_16_64] "r" (cospi_16_64) 7195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 7205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang step1_8 = step2_8 + step2_11; 7225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang step1_9 = step2_9 + step2_10; 7235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang step1_14 = step2_13 + step2_14; 7245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang step1_15 = step2_12 + step2_15; 7255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 7275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 7285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_0], %[step1_7] \n\t" 7295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step1_15] \n\t" 7305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 7315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 7335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 7345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_1], %[step1_6] \n\t" 7355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step1_14] \n\t" 7365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 7375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 7385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 7395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 7405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 7415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 7425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 7435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 7445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 7475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_2], %[step1_5] \n\t" 7485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step1_13] \n\t" 7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 7525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 7535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_3], %[step1_4] \n\t" 7545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step1_12] \n\t" 7555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 7565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 7575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 7585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 7595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 7605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 7615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 7625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 7635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 7645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 7665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_3], %[step1_4] \n\t" 7675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step1_11] \n\t" 7685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 7695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 7705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 7715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 7725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_2], %[step1_5] \n\t" 7735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step1_10] \n\t" 7745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 7755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 7765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 7775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 7785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 7795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 7805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 7815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 7825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 7835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_1], %[step1_6] \n\t" 7855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 7865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[load5], %[step1_9] \n\t" 7875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 7885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 7895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 7905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 7915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_0], %[step1_7] \n\t" 7925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[load6], %[step1_8] \n\t" 7935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 7945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 7955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 7965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 7975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 7985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 7995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 8005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 8015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 8045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_0], %[step1_7] \n\t" 8055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step1_8] \n\t" 8065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 8075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 8085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 8095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 8105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_1], %[step1_6] \n\t" 8115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step1_9] \n\t" 8125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 8135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 8155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 8185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 8195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 8205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 8235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[step1_2], %[step1_5] \n\t" 8245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step1_10] \n\t" 8255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 8265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 8275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 8285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 8295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[step1_3], %[step1_4] \n\t" 8305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step1_11] \n\t" 8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 8325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 8345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 8355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 8365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 8375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 8385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 8395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 8425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_3], %[step1_4] \n\t" 8435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step1_12] \n\t" 8445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 8455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 8465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 8475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 8485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_2], %[step1_5] \n\t" 8495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step1_13] \n\t" 8505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 8515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 8535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 8545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 8555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 8565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 8575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 8585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load7], 0(%[dest_pix]) \n\t" 8615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load5], %[step1_1], %[step1_6] \n\t" 8625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load5], %[load5], %[step1_14] \n\t" 8635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load5], %[load5], 32 \n\t" 8645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load5], %[load5], 6 \n\t" 8655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load7], %[load7], %[load5] \n\t" 8665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load5], %[load7](%[cm]) \n\t" 8675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load6], %[step1_0], %[step1_7] \n\t" 8685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sub %[load6], %[load6], %[step1_15] \n\t" 8695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load5], 0(%[dest_pix]) \n\t" 8705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 8715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbu %[load8], 0(%[dest_pix]) \n\t" 8725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[load6], %[load6], 32 \n\t" 8735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[load6], %[load6], 6 \n\t" 8745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[load8], %[load8], %[load6] \n\t" 8755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[load6], %[load8](%[cm]) \n\t" 8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[load6], 0(%[dest_pix]) \n\t" 8775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), 8795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) 8805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [cm] "r" (cm), [dest_stride] "r" (dest_stride), 8815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), 8825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), 8835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), 8845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), 8855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), 8865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), 8875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), 8885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) 8895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 8905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang input += 16; 8925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 8935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 8945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, 8965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int dest_stride) { 8975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang DECLARE_ALIGNED(32, int16_t, out[16 * 16]); 8985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t pos = 45; 8995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* bit positon for extract from acc */ 9015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 9025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "wrdsp %[pos], 1 \n\t" 9035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : 9045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [pos] "r" (pos) 9055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 9065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // First transform rows 908b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_rows_dspr2(input, out, 16); 9095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // Then transform columns and add to dest 911b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_cols_add_blk_dspr2(out, dest, dest_stride); 9125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 9135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 914b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst16(const int16_t *input, int16_t *output) { 9155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 9165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x0 = input[15]; 9185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x1 = input[0]; 9195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x2 = input[13]; 9205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x3 = input[2]; 9215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x4 = input[11]; 9225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x5 = input[4]; 9235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x6 = input[9]; 9245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x7 = input[6]; 9255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x8 = input[7]; 9265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x9 = input[8]; 9275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x10 = input[5]; 9285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x11 = input[10]; 9295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x12 = input[3]; 9305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x13 = input[12]; 9315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x14 = input[1]; 9325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x15 = input[14]; 9335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 9355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { 9365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[0] = output[1] = output[2] = output[3] = output[4] 9375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang = output[5] = output[6] = output[7] = output[8] 9385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang = output[9] = output[10] = output[11] = output[12] 9395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang = output[13] = output[14] = output[15] = 0; 9405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang return; 9415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 9425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // stage 1 9445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 9455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 9465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 9475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 9485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 9495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 9505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 9515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 9525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 9535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 9545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 9555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 9565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 9575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 9585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 9595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 9605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x0 = dct_const_round_shift(s0 + s8); 9625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x1 = dct_const_round_shift(s1 + s9); 9635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x2 = dct_const_round_shift(s2 + s10); 9645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x3 = dct_const_round_shift(s3 + s11); 9655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x4 = dct_const_round_shift(s4 + s12); 9665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x5 = dct_const_round_shift(s5 + s13); 9675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x6 = dct_const_round_shift(s6 + s14); 9685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x7 = dct_const_round_shift(s7 + s15); 9695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x8 = dct_const_round_shift(s0 - s8); 9705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x9 = dct_const_round_shift(s1 - s9); 9715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x10 = dct_const_round_shift(s2 - s10); 9725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x11 = dct_const_round_shift(s3 - s11); 9735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x12 = dct_const_round_shift(s4 - s12); 9745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x13 = dct_const_round_shift(s5 - s13); 9755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x14 = dct_const_round_shift(s6 - s14); 9765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x15 = dct_const_round_shift(s7 - s15); 9775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // stage 2 9795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s0 = x0; 9805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s1 = x1; 9815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s2 = x2; 9825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s3 = x3; 9835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s4 = x4; 9845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s5 = x5; 9855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s6 = x6; 9865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s7 = x7; 9875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 9885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 9895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 9905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 9915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 9925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 9935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 9945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 9955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x0 = s0 + s4; 9975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x1 = s1 + s5; 9985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x2 = s2 + s6; 9995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x3 = s3 + s7; 10005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x4 = s0 - s4; 10015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x5 = s1 - s5; 10025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x6 = s2 - s6; 10035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x7 = s3 - s7; 10045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x8 = dct_const_round_shift(s8 + s12); 10055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x9 = dct_const_round_shift(s9 + s13); 10065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x10 = dct_const_round_shift(s10 + s14); 10075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x11 = dct_const_round_shift(s11 + s15); 10085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x12 = dct_const_round_shift(s8 - s12); 10095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x13 = dct_const_round_shift(s9 - s13); 10105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x14 = dct_const_round_shift(s10 - s14); 10115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x15 = dct_const_round_shift(s11 - s15); 10125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // stage 3 10145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s0 = x0; 10155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s1 = x1; 10165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s2 = x2; 10175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s3 = x3; 10185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 10195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 10205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 10215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 10225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s8 = x8; 10235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s9 = x9; 10245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s10 = x10; 10255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s11 = x11; 10265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 10275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 10285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 10295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 10305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x0 = s0 + s2; 10325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x1 = s1 + s3; 10335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x2 = s0 - s2; 10345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x3 = s1 - s3; 10355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x4 = dct_const_round_shift(s4 + s6); 10365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x5 = dct_const_round_shift(s5 + s7); 10375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x6 = dct_const_round_shift(s4 - s6); 10385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x7 = dct_const_round_shift(s5 - s7); 10395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x8 = s8 + s10; 10405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x9 = s9 + s11; 10415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x10 = s8 - s10; 10425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x11 = s9 - s11; 10435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x12 = dct_const_round_shift(s12 + s14); 10445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x13 = dct_const_round_shift(s13 + s15); 10455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x14 = dct_const_round_shift(s12 - s14); 10465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x15 = dct_const_round_shift(s13 - s15); 10475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // stage 4 10495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s2 = (- cospi_16_64) * (x2 + x3); 10505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s3 = cospi_16_64 * (x2 - x3); 10515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s6 = cospi_16_64 * (x6 + x7); 10525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s7 = cospi_16_64 * (- x6 + x7); 10535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s10 = cospi_16_64 * (x10 + x11); 10545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s11 = cospi_16_64 * (- x10 + x11); 10555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s14 = (- cospi_16_64) * (x14 + x15); 10565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang s15 = cospi_16_64 * (x14 - x15); 10575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x2 = dct_const_round_shift(s2); 10595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x3 = dct_const_round_shift(s3); 10605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x6 = dct_const_round_shift(s6); 10615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x7 = dct_const_round_shift(s7); 10625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x10 = dct_const_round_shift(s10); 10635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x11 = dct_const_round_shift(s11); 10645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x14 = dct_const_round_shift(s14); 10655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang x15 = dct_const_round_shift(s15); 10665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[0] = x0; 10685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[1] = -x8; 10695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[2] = x12; 10705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[3] = -x4; 10715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[4] = x6; 10725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[5] = x14; 10735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[6] = x10; 10745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[7] = x2; 10755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[8] = x3; 10765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[9] = x11; 10775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[10] = x15; 10785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[11] = x7; 10795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[12] = x5; 10805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[13] = -x13; 10815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[14] = x9; 10825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang output[15] = -x1; 10835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 10845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, 10865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int pitch, int tx_type) { 10875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int i, j; 10885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang DECLARE_ALIGNED(32, int16_t, out[16 * 16]); 10895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int16_t *outptr = out; 10905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int16_t temp_out[16]; 10915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t pos = 45; 10925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* bit positon for extract from acc */ 10945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 10955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "wrdsp %[pos], 1 \n\t" 10965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : 10975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [pos] "r" (pos) 10985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 10995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang switch (tx_type) { 11015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case DCT_DCT: // DCT in both horizontal and vertical 1102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_rows_dspr2(input, outptr, 16); 1103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_cols_add_blk_dspr2(out, dest, pitch); 11045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case ADST_DCT: // ADST in vertical, DCT in horizontal 1106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_rows_dspr2(input, outptr, 16); 11075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang outptr = out; 11095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 16; ++i) { 1111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16(outptr, temp_out); 11125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (j = 0; j < 16; ++j) 11145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dest[j * pitch + i] = 11155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 11165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang + dest[j * pitch + i]); 11175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang outptr += 16; 11185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case DCT_ADST: // DCT in vertical, ADST in horizontal 11215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { 11225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int16_t temp_in[16 * 16]; 11235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 16; ++i) { 11255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch row */ 11265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load((const uint8_t *)(input + 16)); 11275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1128b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16(input, outptr); 11295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang input += 16; 11305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang outptr += 16; 11315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 16; ++i) 11345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (j = 0; j < 16; ++j) 11355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp_in[j * 16 + i] = out[i * 16 + j]; 11365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1137b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_cols_add_blk_dspr2(temp_in, dest, pitch); 11385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case ADST_ADST: // ADST in both directions 11415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { 11425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int16_t temp_in[16]; 11435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 16; ++i) { 11455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch row */ 11465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vp9_prefetch_load((const uint8_t *)(input + 16)); 11475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1148b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16(input, outptr); 11495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang input += 16; 11505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang outptr += 16; 11515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 16; ++i) { 11545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (j = 0; j < 16; ++j) 11555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp_in[j] = out[j * 16 + i]; 1156b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian iadst16(temp_in, temp_out); 11575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (j = 0; j < 16; ++j) 11585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dest[j * pitch + i] = 11595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 11605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang + dest[j * pitch + i]); 11615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang default: 11655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); 11665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 11695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, 11715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int dest_stride) { 11725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang DECLARE_ALIGNED(32, int16_t, out[16 * 16]); 11735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int16_t *outptr = out; 11745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t i; 11755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t pos = 45; 11765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* bit positon for extract from acc */ 11785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 11795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "wrdsp %[pos], 1 \n\t" 11805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : 11815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [pos] "r" (pos) 11825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 11835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // First transform rows. Since all non-zero dct coefficients are in 11855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // upper-left 4x4 area, we only need to calculate first 4 rows here. 1186b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_rows_dspr2(input, outptr, 4); 11875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang outptr += 4; 11895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (i = 0; i < 6; ++i) { 11905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 11915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 0(%[outptr]) \n\t" 11925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 32(%[outptr]) \n\t" 11935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 64(%[outptr]) \n\t" 11945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 96(%[outptr]) \n\t" 11955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 128(%[outptr]) \n\t" 11965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 160(%[outptr]) \n\t" 11975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 192(%[outptr]) \n\t" 11985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 224(%[outptr]) \n\t" 11995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 256(%[outptr]) \n\t" 12005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 288(%[outptr]) \n\t" 12015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 320(%[outptr]) \n\t" 12025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 352(%[outptr]) \n\t" 12035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 384(%[outptr]) \n\t" 12045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 416(%[outptr]) \n\t" 12055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 448(%[outptr]) \n\t" 12065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw $zero, 480(%[outptr]) \n\t" 12075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : 12095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [outptr] "r" (outptr) 12105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 12115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang outptr += 2; 12135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 12145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang // Then transform columns 1216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian idct16_cols_add_blk_dspr2(out, dest, dest_stride); 12175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 12185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, 12205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int dest_stride) { 12215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t pos = 45; 12225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t out; 12235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t r; 12245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t a1, absa1; 12255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t vector_a1; 12265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t t1, t2, t3, t4; 12275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t vector_1, vector_2, vector_3, vector_4; 12285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* bit positon for extract from acc */ 12305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 12315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "wrdsp %[pos], 1 \n\t" 12325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : 12345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [pos] "r" (pos) 12355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 12365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); 12385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 12395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addi %[out], %[out], 32 \n\t" 12405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sra %[a1], %[out], 6 \n\t" 12415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [out] "+r" (out), [a1] "=r" (a1) 12435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : 12445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 12455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang if (a1 < 0) { 12475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* use quad-byte 12485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * input and output memory are four byte aligned */ 12495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 12505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "abs %[absa1], %[a1] \n\t" 12515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "replv.qb %[vector_a1], %[absa1] \n\t" 12525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) 12545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [a1] "r" (a1) 12555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 12565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (r = 16; r--;) { 12585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 12595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t1], 0(%[dest]) \n\t" 12605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t2], 4(%[dest]) \n\t" 12615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t3], 8(%[dest]) \n\t" 12625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t4], 12(%[dest]) \n\t" 12635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 12645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 12655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 12665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 12675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_1], 0(%[dest]) \n\t" 12685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_2], 4(%[dest]) \n\t" 12695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_3], 8(%[dest]) \n\t" 12705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_4], 12(%[dest]) \n\t" 12715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[dest], %[dest], %[dest_stride] \n\t" 12725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), 12745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 12755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), 12765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [dest] "+&r" (dest) 12775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) 12785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 12795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 12805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } else { 12815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* use quad-byte 12825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * input and output memory are four byte aligned */ 12835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 12845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "replv.qb %[vector_a1], %[a1] \n\t" 12855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [vector_a1] "=r" (vector_a1) 12875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [a1] "r" (a1) 12885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 12895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (r = 16; r--;) { 12915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 12925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t1], 0(%[dest]) \n\t" 12935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t2], 4(%[dest]) \n\t" 12945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t3], 8(%[dest]) \n\t" 12955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lw %[t4], 12(%[dest]) \n\t" 12965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 12975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 12985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 12995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 13005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_1], 0(%[dest]) \n\t" 13015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_2], 4(%[dest]) \n\t" 13025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_3], 8(%[dest]) \n\t" 13035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[vector_4], 12(%[dest]) \n\t" 13045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "add %[dest], %[dest], %[dest_stride] \n\t" 13055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 13065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), 13075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 13085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), 13095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [dest] "+&r" (dest) 13105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) 13115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 13125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 13135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 13145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 13155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif // #if HAVE_DSPR2 1316