/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/fwd_txfm_msa.h"

void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
                   cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
                   0, 0, 0, 0 };

  LD_SH16(input, src_stride,
          in0, in1, in2, in3, in4, in5, in6, in7,
          in8, in9, in10, in11, in12, in13, in14, in15);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);

  tmp_ptr += 16;

  /* stp 1 */
  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);

  cnst4 = __msa_splati_h(coeff, 0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);

  cnst5 = __msa_splati_h(coeff, 1);
  cnst5 = __msa_ilvev_h(cnst5, cnst4);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);

  /* stp2 */
  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);

  cnst0 = __msa_splati_h(coeff, 4);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);

  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
  ILVRL_H2_SH(in15, in8, vec1, vec0);
  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr);

  cnst0 = __msa_splati_h(coeff2, 0);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 224);

  ILVRL_H2_SH(in14, in9, vec1, vec0);
  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 128);

  cnst1 = __msa_splati_h(coeff2, 2);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 96);

  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  cnst1 = __msa_splati_h(coeff, 3);
  cnst1 = __msa_ilvev_h(cnst0, cnst1);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  /* stp4 */
  ADD2(stp34, stp25, stp33, stp22, in13, in10);

  ILVRL_H2_SH(in13, in10, vec1, vec0);
  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 64);

  cnst0 = __msa_splati_h(coeff2, 1);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 160);

  SUB2(stp34, stp25, stp33, stp22, in12, in11);
  ILVRL_H2_SH(in12, in11, vec1, vec0);
  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 192);

  cnst1 = __msa_splati_h(coeff2, 3);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 32);
}

void fdct16x8_1d_row(int16_t *input, int16_t *output) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;

  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
  SRA_4V(in0, in1, in2, in3, 2);
  SRA_4V(in4, in5, in6, in7, 2);
  SRA_4V(in8, in9, in10, in11, 2);
  SRA_4V(in12, in13, in14, in15, 2);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
               in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
}

void vpx_fdct4x4_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3;

  LD_SH4(input, src_stride, in0, in1, in2, in3);

  /* fdct4 pre-process */
  {
    v8i16 vec, mask;
    v16i8 zero = { 0 };
    v16i8 one = __msa_ldi_b(1);

    mask = (v8i16)__msa_sldi_b(zero, one, 15);
    SLLI_4V(in0, in1, in2, in3, 4);
    vec = __msa_ceqi_h(in0, 0);
    vec = vec ^ 255;
    vec = mask & vec;
    in0 += vec;
  }

  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  SRA_4V(in0, in1, in2, in3, 2);
  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}

void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
            in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
            in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}

void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  out[0] = LD_HADD(input, stride);
  out[1] = 0;
}

void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);

  /* column transform */
  for (i = 0; i < 2; ++i) {
    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
  }

  /* row transform */
  for (i = 0; i < 2; ++i) {
    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
  }
}

void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  out[1] = 0;

  out[0] = LD_HADD(input, stride);
  out[0] += LD_HADD(input + 8, stride);
  out[0] += LD_HADD(input + 16 * 8, stride);
  out[0] += LD_HADD(input + 16 * 8 + 8, stride);
  out[0] >>= 1;
}