1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/inv_txfm_msa.h" 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_row_transpose_store(const int16_t *input, 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_buf) { 15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 1st & 2nd 8x8 */ 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); 19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); 207bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, 217bc9febe8749e98a3812a0dc4380ceae75c29450Johann n3); 227bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, 237bc9febe8749e98a3812a0dc4380ceae75c29450Johann n7); 24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); 25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); 26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); 27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 3rd & 4th 8x8 */ 29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); 317bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, 327bc9febe8749e98a3812a0dc4380ceae75c29450Johann n3); 337bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, 347bc9febe8749e98a3812a0dc4380ceae75c29450Johann n7); 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); 36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); 37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_row_even_process_store(int16_t *tmp_buf, 42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_eve_buf) { 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; 46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 1 */ 48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); 51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = vec3; 56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = vec1; 57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); 59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); 63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 2 */ 65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg4; 72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg4; 73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg6 + reg2; 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg2; 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg1 + reg5; 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg5; 77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg7 + reg3; 78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg3; 79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = vec0; 80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg2; 82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg3 + reg4; 83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg3 - reg4; 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg5 - vec1; 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg5 + vec1; 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); 88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 - reg6; 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 + reg6; 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg7 - reg1; 93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 + reg1; 94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); 97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 15 * 8)); 101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf)); 102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 14 * 8)); 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 8)); 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 13 * 8)); 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf + 2 * 8)); 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 12 * 8)); 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 3 * 8)); 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Store 8 */ 112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 11 * 8)); 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf + 4 * 8)); 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 10 * 8)); 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 5 * 8)); 117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc0, (tmp_eve_buf + 9 * 8)); 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc1, (tmp_eve_buf + 6 * 8)); 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc2, (tmp_eve_buf + 8 * 8)); 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(loc3, (tmp_eve_buf + 7 * 8)); 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_row_odd_process_store(int16_t *tmp_buf, 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_odd_buf) { 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 1 */ 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 8); 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 7 * 8); 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 9 * 8); 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 15 * 8); 135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 17 * 8); 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 23 * 8); 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 25 * 8); 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 31 * 8); 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg3; 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg3; 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg7 + reg4; 148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg4; 149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg1 + reg2; 150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg2; 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg6 + reg5; 152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg5; 153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = vec0; 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD2(reg5, reg4, reg3, reg2, vec0, vec1); 157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); 158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg5, reg4, reg3, reg2, vec0, vec1); 160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); 161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf), 8); 162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); 167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 2 */ 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 8 loads */ 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 3 * 8); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 5 * 8); 176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 11 * 8); 177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 13 * 8); 178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 19 * 8); 179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 21 * 8); 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 27 * 8); 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 29 * 8); 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 1897bc9febe8749e98a3812a0dc4380ceae75c29450Johann SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); 192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); 194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); 195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 2007bc9febe8749e98a3812a0dc4380ceae75c29450Johann ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); 202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(reg0, (tmp_odd_buf + 13 * 8)); 203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH(reg1, (tmp_odd_buf + 14 * 8)); 204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); 206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); 212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); 213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2147bc9febe8749e98a3812a0dc4380ceae75c29450Johann ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); 215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); 216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg1, reg5, vec0, vec1); 218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg2, reg6, reg3, reg7, vec0, vec1); 221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); 223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); 226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); 227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2287bc9febe8749e98a3812a0dc4380ceae75c29450Johann ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); 229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); 230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg3, reg7, vec0, vec1); 232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg1, reg5, reg2, reg6, vec0, vec1); 235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); 237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct_butterfly_transpose_store(int16_t *tmp_buf, 240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_eve_buf, 2417bc9febe8749e98a3812a0dc4380ceae75c29450Johann int16_t *tmp_odd_buf, int16_t *dst) { 242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* FINAL BUTTERFLY : Dependency on Even & Odd */ 246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf); 247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 9 * 8); 248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 14 * 8); 249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 6 * 8); 250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf); 251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 8 * 8); 252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 4 * 8); 253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 12 * 8); 254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); 256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); 258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); 259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); 260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); 261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 4 * 8); 264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 13 * 8); 265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 10 * 8); 266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 3 * 8); 267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 2 * 8); 268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 10 * 8); 269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 6 * 8); 270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 14 * 8); 271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); 273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); 275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); 276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); 277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); 278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 2 * 8); 281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 11 * 8); 282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 12 * 8); 283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 7 * 8); 284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 1 * 8); 285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 9 * 8); 286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 5 * 8); 287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 13 * 8); 288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); 290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); 292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); 293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); 294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); 295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 5 * 8); 298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 15 * 8); 299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 8 * 8); 300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 1 * 8); 301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 3 * 8); 302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 11 * 8); 303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 7 * 8); 304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 15 * 8); 305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); 307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); 309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); 310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); 311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); 312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Transpose : 16 vectors */ 314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 1st & 2nd 8x8 */ 3157bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, 3167bc9febe8749e98a3812a0dc4380ceae75c29450Johann n3); 317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m0, n0, m1, n1, (dst + 0), 32); 318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); 319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3207bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, 3217bc9febe8749e98a3812a0dc4380ceae75c29450Johann n7); 322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (dst + 8), 32); 323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); 324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 3rd & 4th 8x8 */ 326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); 327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); 3287bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, 3297bc9febe8749e98a3812a0dc4380ceae75c29450Johann n3); 330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m0, n0, m1, n1, (dst + 16), 32); 331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); 332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3337bc9febe8749e98a3812a0dc4380ceae75c29450Johann TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, 3347bc9febe8749e98a3812a0dc4380ceae75c29450Johann n7); 335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m4, n4, m5, n5, (dst + 24), 32); 336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); 337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { 340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); 341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); 342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); 343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_row_transpose_store(input, &tmp_buf[0]); 345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); 346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); 3477bc9febe8749e98a3812a0dc4380ceae75c29450Johann idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], 3487bc9febe8749e98a3812a0dc4380ceae75c29450Johann output); 349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_column_even_process_store(int16_t *tmp_buf, 352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_eve_buf) { 353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; 356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 1 */ 358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp_buf += (2 * 32); 360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); 362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); 363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); 364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = vec3; 367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = vec1; 368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); 370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); 371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); 372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); 373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); 374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 2 */ 376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 */ 377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); 380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); 381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); 382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); 383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg4; 385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg4; 386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg6 + reg2; 387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg2; 388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg1 + reg5; 389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg5; 390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg7 + reg3; 391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg3; 392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = vec0; 393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg2; 395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg3 + reg4; 396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg3 - reg4; 397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg5 - vec1; 398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = reg5 + vec1; 399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); 401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); 402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 - reg6; 404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 + reg6; 405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = reg7 - reg1; 406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 + reg1; 407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); 409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); 410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ 412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Store 8 */ 413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); 414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, tmp_eve_buf, 8); 415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); 416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); 418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); 419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); 420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Store 8 */ 422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); 423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); 424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); 425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); 427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); 428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); 429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_column_odd_process_store(int16_t *tmp_buf, 432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *tmp_odd_buf) { 433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 1 */ 437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 32); 438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 7 * 32); 439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 9 * 32); 440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 15 * 32); 441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 17 * 32); 442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 23 * 32); 443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 25 * 32); 444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 31 * 32); 445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); 447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); 448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); 449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); 450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = reg0 + reg3; 452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = reg0 - reg3; 453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = reg7 + reg4; 454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = reg7 - reg4; 455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = reg1 + reg2; 456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = reg1 - reg2; 457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = reg6 + reg5; 458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = reg6 - reg5; 459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = vec0; 460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD2(reg5, reg4, reg3, reg2, vec0, vec1); 463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); 464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg5, reg4, reg3, reg2, vec0, vec1); 465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); 466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, tmp_odd_buf, 8); 467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); 470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); 471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); 472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); 473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); 474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); 475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 2 */ 477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 8 loads */ 478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg0 = LD_SH(tmp_buf + 3 * 32); 479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg1 = LD_SH(tmp_buf + 5 * 32); 480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg2 = LD_SH(tmp_buf + 11 * 32); 481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg3 = LD_SH(tmp_buf + 13 * 32); 482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg4 = LD_SH(tmp_buf + 19 * 32); 483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg5 = LD_SH(tmp_buf + 21 * 32); 484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg6 = LD_SH(tmp_buf + 27 * 32); 485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian reg7 = LD_SH(tmp_buf + 29 * 32); 486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); 488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); 489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); 490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); 491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); 494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); 495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); 496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); 497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); 498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); 499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); 500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 Stores */ 502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); 503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); 504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); 505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); 506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); 507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ 509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); 511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); 512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); 514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); 515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg1, reg5, vec0, vec1); 517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg2, reg6, reg3, reg7, vec0, vec1); 520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); 522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); 525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); 526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); 528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); 529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg0, reg4, reg3, reg7, vec0, vec1); 531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB2(reg1, reg5, reg2, reg6, vec0, vec1); 534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); 536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, 5397bc9febe8749e98a3812a0dc4380ceae75c29450Johann int16_t *tmp_odd_buf, uint8_t *dst, 540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* FINAL BUTTERFLY : Dependency on Even & Odd */ 545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf); 546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 9 * 8); 547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 14 * 8); 548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 6 * 8); 549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf); 550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 8 * 8); 551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 4 * 8); 552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 12 * 8); 553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); 555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m0, m2, m4, m6, 6); 556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); 557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); 559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m0, m2, m4, m6, 6); 5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, 5617bc9febe8749e98a3812a0dc4380ceae75c29450Johann m6); 562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 4 * 8); 565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 13 * 8); 566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 10 * 8); 567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 3 * 8); 568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 2 * 8); 569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 10 * 8); 570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 6 * 8); 571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 14 * 8); 572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); 574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m1, m3, m5, m7, 6); 5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); 576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); 578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(m1, m3, m5, m7, 6); 5797bc9febe8749e98a3812a0dc4380ceae75c29450Johann VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, 5807bc9febe8749e98a3812a0dc4380ceae75c29450Johann m7); 581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 2 * 8); 584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 11 * 8); 585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 12 * 8); 586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 7 * 8); 587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 1 * 8); 588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 9 * 8); 589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 5 * 8); 590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 13 * 8); 591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); 593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n0, n2, n4, n6, 6); 5947bc9febe8749e98a3812a0dc4380ceae75c29450Johann VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); 595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); 597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n0, n2, n4, n6, 6); 5987bc9febe8749e98a3812a0dc4380ceae75c29450Johann VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, 5997bc9febe8749e98a3812a0dc4380ceae75c29450Johann n6); 600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* Load 8 & Store 8 */ 602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = LD_SH(tmp_odd_buf + 5 * 8); 603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = LD_SH(tmp_odd_buf + 15 * 8); 604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = LD_SH(tmp_odd_buf + 8 * 8); 605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = LD_SH(tmp_odd_buf + 1 * 8); 606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc0 = LD_SH(tmp_eve_buf + 3 * 8); 607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc1 = LD_SH(tmp_eve_buf + 11 * 8); 608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc2 = LD_SH(tmp_eve_buf + 7 * 8); 609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian loc3 = LD_SH(tmp_eve_buf + 15 * 8); 610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); 612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n1, n3, n5, n7, 6); 6137bc9febe8749e98a3812a0dc4380ceae75c29450Johann VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); 614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); 616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(n1, n3, n5, n7, 6); 6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, 6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann n7); 619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); 624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); 625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); 627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); 6287bc9febe8749e98a3812a0dc4380ceae75c29450Johann idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, 6297bc9febe8749e98a3812a0dc4380ceae75c29450Johann dst_stride); 630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, 633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t i; 635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); 636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *out_ptr = out_arr; 637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* transform rows */ 639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* process 32 * 8 block */ 641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); 642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* transform columns */ 645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* process 8 * 32 block */ 647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), 648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst_stride); 649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, 653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t i; 655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); 656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t *out_ptr = out_arr; 657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 32; i--;) { 6597bc9febe8749e98a3812a0dc4380ceae75c29450Johann __asm__ __volatile__( 660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 0(%[out_ptr]) \n\t" 661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 4(%[out_ptr]) \n\t" 662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 8(%[out_ptr]) \n\t" 663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 12(%[out_ptr]) \n\t" 664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 16(%[out_ptr]) \n\t" 665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 20(%[out_ptr]) \n\t" 666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 24(%[out_ptr]) \n\t" 667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 28(%[out_ptr]) \n\t" 668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 32(%[out_ptr]) \n\t" 669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 36(%[out_ptr]) \n\t" 670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 40(%[out_ptr]) \n\t" 671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 44(%[out_ptr]) \n\t" 672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 48(%[out_ptr]) \n\t" 673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 52(%[out_ptr]) \n\t" 674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 56(%[out_ptr]) \n\t" 675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian "sw $zero, 60(%[out_ptr]) \n\t" 676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian : 6787bc9febe8749e98a3812a0dc4380ceae75c29450Johann : [out_ptr] "r"(out_ptr)); 679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out_ptr += 32; 681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out_ptr = out_arr; 684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rows: only upper-left 8x8 has non-zero coeff */ 686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct32x8_1d_rows_msa(input, out_ptr); 687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* transform columns */ 689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 0; i < 4; ++i) { 690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* process 8 * 32 block */ 691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), 692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst_stride); 693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, 697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride) { 698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t i; 699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t out; 700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; 701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; 702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); 704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); 705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = ROUND_POWER_OF_TWO(out, 6); 706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_fill_h(out); 708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (i = 16; i--;) { 710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(dst, 16, dst0, dst1); 711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(dst + dst_stride, 16, dst2, dst3); 712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst0, res0, res4); 714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst1, res1, res5); 715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst2, res2, res6); 716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian UNPCK_UB_SH(dst3, res3, res7); 717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); 718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); 719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CLIP_SH4_0_255(res0, res1, res2, res3); 720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CLIP_SH4_0_255(res4, res5, res6, res7); 7217bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, 7227bc9febe8749e98a3812a0dc4380ceae75c29450Johann tmp2, tmp3); 723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_UB2(tmp0, tmp1, dst, 16); 725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST_UB2(tmp2, tmp3, dst, 16); 727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 730