1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/inv_txfm_msa.h" 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_row_transpose_store(const int16_t *input, 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_buf) { 15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 1st & 2nd 8x8 */ 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); 19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); 20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, 21da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m0, n0, m1, n1, m2, n2, m3, n3); 22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, 23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m4, n4, m5, n5, m6, n6, m7, n7); 24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); 25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); 26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); 27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 3rd & 4th 8x8 */ 29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); 31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, 32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m0, n0, m1, n1, m2, n2, m3, n3); 33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, 34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m4, n4, m5, n5, m6, n6, m7, n7); 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); 36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); 37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_row_even_process_store(int16_t *tmp_buf, 42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_eve_buf) { 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; 46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 1 */ 48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); 51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = vec3; 56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = vec1; 57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); 59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); 63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 2 */ 65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg4; 72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg4; 73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg6 + reg2; 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg2; 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg1 + reg5; 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg5; 77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg7 + reg3; 78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg3; 79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = vec0; 80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg2; 82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg3 + reg4; 83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg3 - reg4; 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg5 - vec1; 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg5 + vec1; 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); 88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 - reg6; 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 + reg6; 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg7 - reg1; 93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 + reg1; 94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); 97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 15 * 8)); 101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf)); 102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 14 * 8)); 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 8)); 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 13 * 8)); 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf + 2 * 8)); 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 12 * 8)); 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 3 * 8)); 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Store 8 */ 112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 11 * 8)); 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf + 4 * 8)); 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 10 * 8)); 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 5 * 8)); 117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 9 * 8)); 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf + 6 * 8)); 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 8 * 8)); 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 7 * 8)); 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_row_odd_process_store(int16_t *tmp_buf, 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_odd_buf) { 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 1 */ 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 8); 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 7 * 8); 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 9 * 8); 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 15 * 8); 135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 17 * 8); 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 23 * 8); 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 25 * 8); 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 31 * 8); 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg3; 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg3; 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg7 + reg4; 148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg4; 149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg1 + reg2; 150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg2; 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg6 + reg5; 152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg5; 153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = vec0; 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD2(reg5, reg4, reg3, reg2, vec0, vec1); 157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); 158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg5, reg4, reg3, reg2, vec0, vec1); 160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); 161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf), 8); 162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); 167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 2 */ 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 8 loads */ 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 3 * 8); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 5 * 8); 176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 11 * 8); 177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 13 * 8); 178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 19 * 8); 179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 21 * 8); 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 27 * 8); 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 29 * 8); 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0, vec1, vec2, vec3); 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); 192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); 193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); 195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, 202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1, vec2, vec0, vec3); 203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); 204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(reg0, (tmp_odd_buf + 13 * 8)); 205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(reg1, (tmp_odd_buf + 14 * 8)); 206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); 214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); 215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, 217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0, loc1, loc2, loc3); 218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); 219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg1, reg5, vec0, vec1); 221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg2, reg6, reg3, reg7, vec0, vec1); 224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); 226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); 229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); 230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, 232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0, loc1, loc2, loc3); 233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); 234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg3, reg7, vec0, vec1); 236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg1, reg5, reg2, reg6, vec0, vec1); 239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); 241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct_butterfly_transpose_store(int16_t *tmp_buf, 244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_eve_buf, 245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_odd_buf, 246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *dst) { 247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* FINAL BUTTERFLY : Dependency on Even & Odd */ 251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf); 252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 9 * 8); 253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 14 * 8); 254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 6 * 8); 255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf); 256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 8 * 8); 257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 4 * 8); 258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 12 * 8); 259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); 261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); 263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); 264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); 265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); 266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 4 * 8); 269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 13 * 8); 270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 10 * 8); 271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 3 * 8); 272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 2 * 8); 273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 10 * 8); 274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 6 * 8); 275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 14 * 8); 276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); 278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); 280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); 281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); 282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); 283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 2 * 8); 286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 11 * 8); 287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 12 * 8); 288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 7 * 8); 289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 1 * 8); 290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 9 * 8); 291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 5 * 8); 292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 13 * 8); 293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); 295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); 297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); 298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); 299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); 300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 5 * 8); 303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 15 * 8); 304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 8 * 8); 305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 1 * 8); 306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 3 * 8); 307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 11 * 8); 308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 7 * 8); 309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 15 * 8); 310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); 312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); 314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); 315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); 316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); 317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Transpose : 16 vectors */ 319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 1st & 2nd 8x8 */ 320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, 321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m0, n0, m1, n1, m2, n2, m3, n3); 322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m0, n0, m1, n1, (dst + 0), 32); 323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); 324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, 326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m4, n4, m5, n5, m6, n6, m7, n7); 327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (dst + 8), 32); 328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); 329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 3rd & 4th 8x8 */ 331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); 332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); 333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, 334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m0, n0, m1, n1, m2, n2, m3, n3); 335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m0, n0, m1, n1, (dst + 16), 32); 336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); 337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, 339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m4, n4, m5, n5, m6, n6, m7, n7); 340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (dst + 24), 32); 341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); 342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { 345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); 346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); 347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); 348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_row_transpose_store(input, &tmp_buf[0]); 350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); 351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); 352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], 353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian &tmp_odd_buf[0], output); 354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_column_even_process_store(int16_t *tmp_buf, 357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_eve_buf) { 358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; 361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 1 */ 363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp_buf += (2 * 32); 365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); 367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); 368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); 369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = vec3; 372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = vec1; 373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); 375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); 376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); 377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); 378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); 379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 2 */ 381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 */ 382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); 385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); 386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); 387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); 388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg4; 390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg4; 391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg6 + reg2; 392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg2; 393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg1 + reg5; 394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg5; 395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg7 + reg3; 396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg3; 397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = vec0; 398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg2; 400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg3 + reg4; 401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg3 - reg4; 402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg5 - vec1; 403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg5 + vec1; 404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); 406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); 407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 - reg6; 409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 + reg6; 410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg7 - reg1; 411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 + reg1; 412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); 414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); 415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ 417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Store 8 */ 418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); 419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, tmp_eve_buf, 8); 420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); 421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); 423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); 424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); 425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Store 8 */ 427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); 428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); 429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); 430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); 432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); 433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); 434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_column_odd_process_store(int16_t *tmp_buf, 437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_odd_buf) { 438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 1 */ 442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 32); 443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 7 * 32); 444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 9 * 32); 445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 15 * 32); 446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 17 * 32); 447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 23 * 32); 448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 25 * 32); 449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 31 * 32); 450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); 452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); 453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); 454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); 455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg3; 457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg3; 458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg7 + reg4; 459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg4; 460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg1 + reg2; 461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg2; 462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg6 + reg5; 463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg5; 464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = vec0; 465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD2(reg5, reg4, reg3, reg2, vec0, vec1); 468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); 469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg5, reg4, reg3, reg2, vec0, vec1); 470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); 471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, tmp_odd_buf, 8); 472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); 475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); 476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); 477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); 478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); 479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); 480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 2 */ 482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 8 loads */ 483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 3 * 32); 484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 5 * 32); 485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 11 * 32); 486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 13 * 32); 487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 19 * 32); 488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 21 * 32); 489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 27 * 32); 490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 29 * 32); 491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); 493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); 494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); 495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); 496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); 499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); 500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); 501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); 502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); 503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); 504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); 505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); 508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); 509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); 510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); 511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); 512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ 514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); 516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); 517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); 519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); 520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg1, reg5, vec0, vec1); 522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg2, reg6, reg3, reg7, vec0, vec1); 525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); 527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); 530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); 531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); 533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); 534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg3, reg7, vec0, vec1); 536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg1, reg5, reg2, reg6, vec0, vec1); 539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); 541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, 544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_odd_buf, 545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, 546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* FINAL BUTTERFLY : Dependency on Even & Odd */ 551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf); 552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 9 * 8); 553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 14 * 8); 554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 6 * 8); 555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf); 556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 8 * 8); 557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 4 * 8); 558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 12 * 8); 559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); 561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m0, m2, m4, m6, 6); 562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); 563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); 565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m0, m2, m4, m6, 6); 566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), 567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m0, m2, m4, m6); 568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 4 * 8); 571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 13 * 8); 572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 10 * 8); 573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 3 * 8); 574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 2 * 8); 575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 10 * 8); 576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 6 * 8); 577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 14 * 8); 578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); 580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m1, m3, m5, m7, 6); 581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), 582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m1, m3, m5, m7); 583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); 585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m1, m3, m5, m7, 6); 586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), 587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian m1, m3, m5, m7); 588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 2 * 8); 591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 11 * 8); 592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 12 * 8); 593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 7 * 8); 594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 1 * 8); 595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 9 * 8); 596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 5 * 8); 597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 13 * 8); 598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); 600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n0, n2, n4, n6, 6); 601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), 602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n0, n2, n4, n6); 603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); 605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n0, n2, n4, n6, 6); 606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), 607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n0, n2, n4, n6); 608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 5 * 8); 611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 15 * 8); 612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 8 * 8); 613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 1 * 8); 614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 3 * 8); 615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 11 * 8); 616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 7 * 8); 617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 15 * 8); 618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); 620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n1, n3, n5, n7, 6); 621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), 622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n1, n3, n5, n7); 623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); 625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n1, n3, n5, n7, 6); 626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), 627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n1, n3, n5, n7); 628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); 633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); 634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); 636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); 637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], 638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst, dst_stride); 639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, 642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t i; 644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); 645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *out_ptr = out_arr; 646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* transform rows */ 648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* process 32 * 8 block */ 650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); 651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* transform columns */ 654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* process 8 * 32 block */ 656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), 657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst_stride); 658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, 662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t i; 664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); 665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *out_ptr = out_arr; 666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 32; i--;) { 668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __asm__ __volatile__ ( 669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 0(%[out_ptr]) \n\t" 670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 4(%[out_ptr]) \n\t" 671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 8(%[out_ptr]) \n\t" 672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 12(%[out_ptr]) \n\t" 673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 16(%[out_ptr]) \n\t" 674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 20(%[out_ptr]) \n\t" 675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 24(%[out_ptr]) \n\t" 676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 28(%[out_ptr]) \n\t" 677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 32(%[out_ptr]) \n\t" 678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 36(%[out_ptr]) \n\t" 679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 40(%[out_ptr]) \n\t" 680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 44(%[out_ptr]) \n\t" 681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 48(%[out_ptr]) \n\t" 682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 52(%[out_ptr]) \n\t" 683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 56(%[out_ptr]) \n\t" 684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 60(%[out_ptr]) \n\t" 685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian : 687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian : [out_ptr] "r" (out_ptr) 688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ); 689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out_ptr += 32; 691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out_ptr = out_arr; 694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rows: only upper-left 8x8 has non-zero coeff */ 696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_1d_rows_msa(input, out_ptr); 697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* transform columns */ 699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* process 8 * 32 block */ 701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), 702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst_stride); 703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, 707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t i; 709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t out; 710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; 711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; 712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); 714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); 715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = ROUND_POWER_OF_TWO(out, 6); 716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_fill_h(out); 718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 16; i--;) { 720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(dst, 16, dst0, dst1); 721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(dst + dst_stride, 16, dst2, dst3); 722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst0, res0, res4); 724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst1, res1, res5); 725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst2, res2, res6); 726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst3, res3, res7); 727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); 728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); 729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CLIP_SH4_0_255(res0, res1, res2, res3); 730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CLIP_SH4_0_255(res4, res5, res6, res7); 731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, 732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0, tmp1, tmp2, tmp3); 733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_UB2(tmp0, tmp1, dst, 16); 735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_UB2(tmp2, tmp3, dst, 16); 737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 740