1/* 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <emmintrin.h> // SSE2 13 14#include "./vp9_rtcd.h" 15#include "./vpx_dsp_rtcd.h" 16#include "vpx_dsp/txfm_common.h" 17#include "vpx_dsp/x86/fwd_txfm_sse2.h" 18#include "vpx_dsp/x86/txfm_common_sse2.h" 19#include "vpx_ports/mem.h" 20 21static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, 22 int stride) { 23 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 24 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 25 __m128i mask; 26 27 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 28 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 29 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 30 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 31 32 in[0] = _mm_slli_epi16(in[0], 4); 33 in[1] = _mm_slli_epi16(in[1], 4); 34 in[2] = _mm_slli_epi16(in[2], 4); 35 in[3] = _mm_slli_epi16(in[3], 4); 36 37 mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); 38 in[0] = _mm_add_epi16(in[0], mask); 39 in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); 40} 41 42static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) { 43 const __m128i kOne = _mm_set1_epi16(1); 44 __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); 45 __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); 46 __m128i out01 = _mm_add_epi16(in01, kOne); 47 __m128i out23 = _mm_add_epi16(in23, kOne); 48 out01 = _mm_srai_epi16(out01, 2); 49 out23 = _mm_srai_epi16(out23, 2); 50 store_output(&out01, (output + 0 * 8)); 51 store_output(&out23, (output + 1 * 8)); 52} 53 54static INLINE void transpose_4x4(__m128i *res) { 55 // Combine and transpose 56 // 00 01 02 03 20 21 22 23 57 // 10 11 12 13 30 31 32 33 58 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 59 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 60 61 // 00 10 01 11 02 12 03 13 62 // 20 30 21 31 22 32 23 33 63 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); 64 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); 65 66 // 00 10 20 30 01 11 21 31 67 // 02 12 22 32 03 13 23 33 68 // only use the first 4 16-bit integers 69 res[1] = _mm_unpackhi_epi64(res[0], res[0]); 70 res[3] = _mm_unpackhi_epi64(res[2], res[2]); 71} 72 73static void fdct4_sse2(__m128i *in) { 74 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 75 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 76 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 77 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 78 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 79 80 __m128i u[4], v[4]; 81 u[0]=_mm_unpacklo_epi16(in[0], in[1]); 82 u[1]=_mm_unpacklo_epi16(in[3], in[2]); 83 84 v[0] = _mm_add_epi16(u[0], u[1]); 85 v[1] = _mm_sub_epi16(u[0], u[1]); 86 87 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 88 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 89 u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 90 u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 91 92 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 93 v[1] = _mm_add_epi32(u[1], 
k__DCT_CONST_ROUNDING); 94 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 95 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 96 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 97 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 98 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 99 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 100 101 in[0] = _mm_packs_epi32(u[0], u[1]); 102 in[1] = _mm_packs_epi32(u[2], u[3]); 103 transpose_4x4(in); 104} 105 106static void fadst4_sse2(__m128i *in) { 107 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); 108 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); 109 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); 110 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); 111 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); 112 const __m128i kZero = _mm_set1_epi16(0); 113 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 114 __m128i u[8], v[8]; 115 __m128i in7 = _mm_add_epi16(in[0], in[1]); 116 117 u[0] = _mm_unpacklo_epi16(in[0], in[1]); 118 u[1] = _mm_unpacklo_epi16(in[2], in[3]); 119 u[2] = _mm_unpacklo_epi16(in7, kZero); 120 u[3] = _mm_unpacklo_epi16(in[2], kZero); 121 u[4] = _mm_unpacklo_epi16(in[3], kZero); 122 123 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 124 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 125 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 126 v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 127 v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 128 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 129 v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); 130 131 u[0] = _mm_add_epi32(v[0], v[1]); 132 u[1] = _mm_sub_epi32(v[2], v[6]); 133 u[2] = _mm_add_epi32(v[3], v[4]); 134 u[3] = _mm_sub_epi32(u[2], u[0]); 135 u[4] = _mm_slli_epi32(v[5], 2); 136 u[5] = _mm_sub_epi32(u[4], v[5]); 137 u[6] = _mm_add_epi32(u[3], u[5]); 138 139 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 140 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 141 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 142 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 143 144 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 145 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 146 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 147 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 148 149 in[0] = _mm_packs_epi32(u[0], u[2]); 150 in[1] = _mm_packs_epi32(u[1], u[3]); 151 transpose_4x4(in); 152} 153 154void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, 155 int stride, int tx_type) { 156 __m128i in[4]; 157 158 switch (tx_type) { 159 case DCT_DCT: 160 vpx_fdct4x4_sse2(input, output, stride); 161 break; 162 case ADST_DCT: 163 load_buffer_4x4(input, in, stride); 164 fadst4_sse2(in); 165 fdct4_sse2(in); 166 write_buffer_4x4(output, in); 167 break; 168 case DCT_ADST: 169 load_buffer_4x4(input, in, stride); 170 fdct4_sse2(in); 171 fadst4_sse2(in); 172 write_buffer_4x4(output, in); 173 break; 174 case ADST_ADST: 175 load_buffer_4x4(input, in, stride); 176 fadst4_sse2(in); 177 fadst4_sse2(in); 178 write_buffer_4x4(output, in); 179 break; 180 default: 181 assert(0); 182 break; 183 } 184} 185 186void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, 187 int16_t* coeff_ptr, intptr_t n_coeffs, 188 int skip_block, const int16_t* zbin_ptr, 189 const int16_t* round_ptr, const int16_t* quant_ptr, 190 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, 191 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, 192 uint16_t* 
eob_ptr, 193 const int16_t* scan_ptr, 194 const int16_t* iscan_ptr) { 195 __m128i zero; 196 int pass; 197 // Constants 198 // When we use them, in one case, they are all the same. In all others 199 // it's a pair of them that we need to repeat four times. This is done 200 // by constructing the 32 bit constant corresponding to that pair. 201 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 202 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 203 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 204 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 205 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 206 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 207 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 208 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 209 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 210 // Load input 211 __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 212 __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 213 __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 214 __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 215 __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 216 __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 217 __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); 218 __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 219 __m128i *in[8]; 220 int index = 0; 221 222 (void)scan_ptr; 223 (void)zbin_ptr; 224 (void)quant_shift_ptr; 225 (void)coeff_ptr; 226 227 // Pre-condition input (shift by two) 228 in0 = _mm_slli_epi16(in0, 2); 229 in1 = _mm_slli_epi16(in1, 2); 230 in2 = _mm_slli_epi16(in2, 2); 231 in3 = _mm_slli_epi16(in3, 2); 232 in4 = _mm_slli_epi16(in4, 2); 233 in5 = _mm_slli_epi16(in5, 2); 234 in6 = _mm_slli_epi16(in6, 2); 235 in7 = _mm_slli_epi16(in7, 2); 236 237 in[0] = &in0; 238 in[1] = &in1; 239 in[2] = &in2; 240 in[3] = &in3; 241 in[4] = &in4; 242 in[5] = &in5; 243 in[6] = &in6; 244 in[7] = &in7; 245 246 // We do two passes, first the columns, then the rows. The results of the 247 // first pass are transposed so that the same column code can be reused. The 248 // results of the second pass are also transposed so that the rows (processed 249 // as columns) are put back in row positions. 250 for (pass = 0; pass < 2; pass++) { 251 // To store results of each pass before the transpose. 
252 __m128i res0, res1, res2, res3, res4, res5, res6, res7; 253 // Add/subtract 254 const __m128i q0 = _mm_add_epi16(in0, in7); 255 const __m128i q1 = _mm_add_epi16(in1, in6); 256 const __m128i q2 = _mm_add_epi16(in2, in5); 257 const __m128i q3 = _mm_add_epi16(in3, in4); 258 const __m128i q4 = _mm_sub_epi16(in3, in4); 259 const __m128i q5 = _mm_sub_epi16(in2, in5); 260 const __m128i q6 = _mm_sub_epi16(in1, in6); 261 const __m128i q7 = _mm_sub_epi16(in0, in7); 262 // Work on first four results 263 { 264 // Add/subtract 265 const __m128i r0 = _mm_add_epi16(q0, q3); 266 const __m128i r1 = _mm_add_epi16(q1, q2); 267 const __m128i r2 = _mm_sub_epi16(q1, q2); 268 const __m128i r3 = _mm_sub_epi16(q0, q3); 269 // Interleave to do the multiply by constants which gets us into 32bits 270 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 271 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 272 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 273 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 274 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 275 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 276 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 277 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 278 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 279 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 280 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 281 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 282 // dct_const_round_shift 283 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 284 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 285 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 286 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 287 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 288 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 289 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 290 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 291 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 292 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 293 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 294 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 295 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 296 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 297 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 298 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 299 // Combine 300 res0 = _mm_packs_epi32(w0, w1); 301 res4 = _mm_packs_epi32(w2, w3); 302 res2 = _mm_packs_epi32(w4, w5); 303 res6 = _mm_packs_epi32(w6, w7); 304 } 305 // Work on next four results 306 { 307 // Interleave to do the multiply by constants which gets us into 32bits 308 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 309 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 310 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 311 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 312 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 313 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 314 // dct_const_round_shift 315 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 316 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 317 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 318 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 319 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 320 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 321 const __m128i s2 = 
_mm_srai_epi32(f2, DCT_CONST_BITS); 322 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 323 // Combine 324 const __m128i r0 = _mm_packs_epi32(s0, s1); 325 const __m128i r1 = _mm_packs_epi32(s2, s3); 326 // Add/subtract 327 const __m128i x0 = _mm_add_epi16(q4, r0); 328 const __m128i x1 = _mm_sub_epi16(q4, r0); 329 const __m128i x2 = _mm_sub_epi16(q7, r1); 330 const __m128i x3 = _mm_add_epi16(q7, r1); 331 // Interleave to do the multiply by constants which gets us into 32bits 332 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 333 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 334 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 335 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 336 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 337 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 338 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 339 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 340 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 341 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 342 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 343 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 344 // dct_const_round_shift 345 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 346 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 347 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 348 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 349 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 350 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 351 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 352 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 353 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 354 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 355 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 356 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 357 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 358 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 359 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 360 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 361 // Combine 362 res1 = _mm_packs_epi32(w0, w1); 363 res7 = _mm_packs_epi32(w2, w3); 364 res5 = _mm_packs_epi32(w4, w5); 365 res3 = _mm_packs_epi32(w6, w7); 366 } 367 // Transpose the 8x8. 
368 { 369 // 00 01 02 03 04 05 06 07 370 // 10 11 12 13 14 15 16 17 371 // 20 21 22 23 24 25 26 27 372 // 30 31 32 33 34 35 36 37 373 // 40 41 42 43 44 45 46 47 374 // 50 51 52 53 54 55 56 57 375 // 60 61 62 63 64 65 66 67 376 // 70 71 72 73 74 75 76 77 377 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 378 const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); 379 const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); 380 const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); 381 const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); 382 const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); 383 const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); 384 const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); 385 // 00 10 01 11 02 12 03 13 386 // 20 30 21 31 22 32 23 33 387 // 04 14 05 15 06 16 07 17 388 // 24 34 25 35 26 36 27 37 389 // 40 50 41 51 42 52 43 53 390 // 60 70 61 71 62 72 63 73 391 // 54 54 55 55 56 56 57 57 392 // 64 74 65 75 66 76 67 77 393 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 394 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 395 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 396 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 397 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 398 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 399 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 400 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 401 // 00 10 20 30 01 11 21 31 402 // 40 50 60 70 41 51 61 71 403 // 02 12 22 32 03 13 23 33 404 // 42 52 62 72 43 53 63 73 405 // 04 14 24 34 05 15 21 36 406 // 44 54 64 74 45 55 61 76 407 // 06 16 26 36 07 17 27 37 408 // 46 56 66 76 47 57 67 77 409 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 410 in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 411 in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 412 in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 413 in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 414 in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 415 in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 416 in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 417 // 00 10 20 30 40 50 60 70 418 // 01 11 21 31 41 51 61 71 419 // 02 12 22 32 42 52 62 72 420 // 03 13 23 33 43 53 63 73 421 // 04 14 24 34 44 54 64 74 422 // 05 15 25 35 45 55 65 75 423 // 06 16 26 36 46 56 66 76 424 // 07 17 27 37 47 57 67 77 425 } 426 } 427 // Post-condition output and store it 428 { 429 // Post-condition (division by two) 430 // division of two 16 bits signed numbers using shifts 431 // n / 2 = (n - (n >> 15)) >> 1 432 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); 433 const __m128i sign_in1 = _mm_srai_epi16(in1, 15); 434 const __m128i sign_in2 = _mm_srai_epi16(in2, 15); 435 const __m128i sign_in3 = _mm_srai_epi16(in3, 15); 436 const __m128i sign_in4 = _mm_srai_epi16(in4, 15); 437 const __m128i sign_in5 = _mm_srai_epi16(in5, 15); 438 const __m128i sign_in6 = _mm_srai_epi16(in6, 15); 439 const __m128i sign_in7 = _mm_srai_epi16(in7, 15); 440 in0 = _mm_sub_epi16(in0, sign_in0); 441 in1 = _mm_sub_epi16(in1, sign_in1); 442 in2 = _mm_sub_epi16(in2, sign_in2); 443 in3 = _mm_sub_epi16(in3, sign_in3); 444 in4 = _mm_sub_epi16(in4, sign_in4); 445 in5 = _mm_sub_epi16(in5, sign_in5); 446 in6 = _mm_sub_epi16(in6, sign_in6); 447 in7 = _mm_sub_epi16(in7, sign_in7); 448 in0 = _mm_srai_epi16(in0, 1); 449 in1 = _mm_srai_epi16(in1, 1); 450 in2 = _mm_srai_epi16(in2, 1); 451 in3 = _mm_srai_epi16(in3, 1); 452 in4 = _mm_srai_epi16(in4, 1); 453 in5 = _mm_srai_epi16(in5, 1); 454 in6 = _mm_srai_epi16(in6, 1); 455 in7 = _mm_srai_epi16(in7, 1); 456 } 457 458 iscan_ptr += n_coeffs; 459 
qcoeff_ptr += n_coeffs; 460 dqcoeff_ptr += n_coeffs; 461 n_coeffs = -n_coeffs; 462 zero = _mm_setzero_si128(); 463 464 if (!skip_block) { 465 __m128i eob; 466 __m128i round, quant, dequant; 467 { 468 __m128i coeff0, coeff1; 469 470 // Setup global values 471 { 472 round = _mm_load_si128((const __m128i*)round_ptr); 473 quant = _mm_load_si128((const __m128i*)quant_ptr); 474 dequant = _mm_load_si128((const __m128i*)dequant_ptr); 475 } 476 477 { 478 __m128i coeff0_sign, coeff1_sign; 479 __m128i qcoeff0, qcoeff1; 480 __m128i qtmp0, qtmp1; 481 // Do DC and first 15 AC 482 coeff0 = *in[0]; 483 coeff1 = *in[1]; 484 485 // Poor man's sign extract 486 coeff0_sign = _mm_srai_epi16(coeff0, 15); 487 coeff1_sign = _mm_srai_epi16(coeff1, 15); 488 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 489 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 490 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 491 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 492 493 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 494 round = _mm_unpackhi_epi64(round, round); 495 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 496 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 497 quant = _mm_unpackhi_epi64(quant, quant); 498 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 499 500 // Reinsert signs 501 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 502 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 503 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 504 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 505 506 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); 507 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); 508 509 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 510 dequant = _mm_unpackhi_epi64(dequant, dequant); 511 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 512 513 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); 514 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); 515 } 516 517 { 518 // Scan for eob 519 __m128i zero_coeff0, zero_coeff1; 520 __m128i nzero_coeff0, nzero_coeff1; 521 __m128i iscan0, iscan1; 522 __m128i eob1; 523 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 524 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 525 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 526 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 527 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); 528 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); 529 // Add one to convert from indices to counts 530 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 531 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 532 eob = _mm_and_si128(iscan0, nzero_coeff0); 533 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 534 eob = _mm_max_epi16(eob, eob1); 535 } 536 n_coeffs += 8 * 2; 537 } 538 539 // AC only loop 540 index = 2; 541 while (n_coeffs < 0) { 542 __m128i coeff0, coeff1; 543 { 544 __m128i coeff0_sign, coeff1_sign; 545 __m128i qcoeff0, qcoeff1; 546 __m128i qtmp0, qtmp1; 547 548 assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); 549 coeff0 = *in[index]; 550 coeff1 = *in[index + 1]; 551 552 // Poor man's sign extract 553 coeff0_sign = _mm_srai_epi16(coeff0, 15); 554 coeff1_sign = _mm_srai_epi16(coeff1, 15); 555 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 556 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 557 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 558 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 559 560 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 561 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 562 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 563 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 564 565 // Reinsert signs 566 
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 567 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 568 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 569 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 570 571 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); 572 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); 573 574 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 575 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 576 577 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); 578 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); 579 } 580 581 { 582 // Scan for eob 583 __m128i zero_coeff0, zero_coeff1; 584 __m128i nzero_coeff0, nzero_coeff1; 585 __m128i iscan0, iscan1; 586 __m128i eob0, eob1; 587 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 588 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 589 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 590 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 591 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); 592 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); 593 // Add one to convert from indices to counts 594 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 595 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 596 eob0 = _mm_and_si128(iscan0, nzero_coeff0); 597 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 598 eob0 = _mm_max_epi16(eob0, eob1); 599 eob = _mm_max_epi16(eob, eob0); 600 } 601 n_coeffs += 8 * 2; 602 index += 2; 603 } 604 605 // Accumulate EOB 606 { 607 __m128i eob_shuffled; 608 eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 609 eob = _mm_max_epi16(eob, eob_shuffled); 610 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 611 eob = _mm_max_epi16(eob, eob_shuffled); 612 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 613 eob = _mm_max_epi16(eob, eob_shuffled); 614 *eob_ptr = _mm_extract_epi16(eob, 1); 615 } 616 } else { 617 do { 618 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); 619 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); 620 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); 621 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); 622 n_coeffs += 8 * 2; 623 } while (n_coeffs < 0); 624 *eob_ptr = 0; 625 } 626} 627 628// load 8x8 array 629static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, 630 int stride) { 631 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); 632 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); 633 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); 634 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); 635 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); 636 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); 637 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); 638 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); 639 640 in[0] = _mm_slli_epi16(in[0], 2); 641 in[1] = _mm_slli_epi16(in[1], 2); 642 in[2] = _mm_slli_epi16(in[2], 2); 643 in[3] = _mm_slli_epi16(in[3], 2); 644 in[4] = _mm_slli_epi16(in[4], 2); 645 in[5] = _mm_slli_epi16(in[5], 2); 646 in[6] = _mm_slli_epi16(in[6], 2); 647 in[7] = _mm_slli_epi16(in[7], 2); 648} 649 650// right shift and rounding 651static INLINE void right_shift_8x8(__m128i *res, const int bit) { 652 __m128i sign0 = _mm_srai_epi16(res[0], 15); 653 __m128i sign1 = _mm_srai_epi16(res[1], 15); 654 __m128i sign2 = _mm_srai_epi16(res[2], 15); 655 __m128i sign3 = _mm_srai_epi16(res[3], 15); 656 __m128i sign4 = _mm_srai_epi16(res[4], 15); 657 __m128i sign5 = 
_mm_srai_epi16(res[5], 15); 658 __m128i sign6 = _mm_srai_epi16(res[6], 15); 659 __m128i sign7 = _mm_srai_epi16(res[7], 15); 660 661 if (bit == 2) { 662 const __m128i const_rounding = _mm_set1_epi16(1); 663 res[0] = _mm_add_epi16(res[0], const_rounding); 664 res[1] = _mm_add_epi16(res[1], const_rounding); 665 res[2] = _mm_add_epi16(res[2], const_rounding); 666 res[3] = _mm_add_epi16(res[3], const_rounding); 667 res[4] = _mm_add_epi16(res[4], const_rounding); 668 res[5] = _mm_add_epi16(res[5], const_rounding); 669 res[6] = _mm_add_epi16(res[6], const_rounding); 670 res[7] = _mm_add_epi16(res[7], const_rounding); 671 } 672 673 res[0] = _mm_sub_epi16(res[0], sign0); 674 res[1] = _mm_sub_epi16(res[1], sign1); 675 res[2] = _mm_sub_epi16(res[2], sign2); 676 res[3] = _mm_sub_epi16(res[3], sign3); 677 res[4] = _mm_sub_epi16(res[4], sign4); 678 res[5] = _mm_sub_epi16(res[5], sign5); 679 res[6] = _mm_sub_epi16(res[6], sign6); 680 res[7] = _mm_sub_epi16(res[7], sign7); 681 682 if (bit == 1) { 683 res[0] = _mm_srai_epi16(res[0], 1); 684 res[1] = _mm_srai_epi16(res[1], 1); 685 res[2] = _mm_srai_epi16(res[2], 1); 686 res[3] = _mm_srai_epi16(res[3], 1); 687 res[4] = _mm_srai_epi16(res[4], 1); 688 res[5] = _mm_srai_epi16(res[5], 1); 689 res[6] = _mm_srai_epi16(res[6], 1); 690 res[7] = _mm_srai_epi16(res[7], 1); 691 } else { 692 res[0] = _mm_srai_epi16(res[0], 2); 693 res[1] = _mm_srai_epi16(res[1], 2); 694 res[2] = _mm_srai_epi16(res[2], 2); 695 res[3] = _mm_srai_epi16(res[3], 2); 696 res[4] = _mm_srai_epi16(res[4], 2); 697 res[5] = _mm_srai_epi16(res[5], 2); 698 res[6] = _mm_srai_epi16(res[6], 2); 699 res[7] = _mm_srai_epi16(res[7], 2); 700 } 701} 702 703// write 8x8 array 704static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, 705 int stride) { 706 store_output(&res[0], (output + 0 * stride)); 707 store_output(&res[1], (output + 1 * stride)); 708 store_output(&res[2], (output + 2 * stride)); 709 store_output(&res[3], (output + 3 * stride)); 710 store_output(&res[4], (output + 4 * stride)); 711 store_output(&res[5], (output + 5 * stride)); 712 store_output(&res[6], (output + 6 * stride)); 713 store_output(&res[7], (output + 7 * stride)); 714} 715 716// perform in-place transpose 717static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { 718 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 719 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 720 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 721 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 722 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 723 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 724 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 725 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 726 // 00 10 01 11 02 12 03 13 727 // 20 30 21 31 22 32 23 33 728 // 04 14 05 15 06 16 07 17 729 // 24 34 25 35 26 36 27 37 730 // 40 50 41 51 42 52 43 53 731 // 60 70 61 71 62 72 63 73 732 // 44 54 45 55 46 56 47 57 733 // 64 74 65 75 66 76 67 77 734 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 735 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 736 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 737 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); 738 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 739 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 740 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 741 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 742 // 00 10 20 30 01 11 21 31 743 // 40 50 60 70 41 51 
61 71 744 // 02 12 22 32 03 13 23 33 745 // 42 52 62 72 43 53 63 73 746 // 04 14 24 34 05 15 25 35 747 // 44 54 64 74 45 55 65 75 748 // 06 16 26 36 07 17 27 37 749 // 46 56 66 76 47 57 67 77 750 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 751 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 752 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 753 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 754 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 755 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 756 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 757 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 758 // 00 10 20 30 40 50 60 70 759 // 01 11 21 31 41 51 61 71 760 // 02 12 22 32 42 52 62 72 761 // 03 13 23 33 43 53 63 73 762 // 04 14 24 34 44 54 64 74 763 // 05 15 25 35 45 55 65 75 764 // 06 16 26 36 46 56 66 76 765 // 07 17 27 37 47 57 67 77 766} 767 768static void fdct8_sse2(__m128i *in) { 769 // constants 770 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 771 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 772 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 773 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 774 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 775 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 776 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 777 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 778 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 779 __m128i u0, u1, u2, u3, u4, u5, u6, u7; 780 __m128i v0, v1, v2, v3, v4, v5, v6, v7; 781 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 782 783 // stage 1 784 s0 = _mm_add_epi16(in[0], in[7]); 785 s1 = _mm_add_epi16(in[1], in[6]); 786 s2 = _mm_add_epi16(in[2], in[5]); 787 s3 = _mm_add_epi16(in[3], in[4]); 788 s4 = _mm_sub_epi16(in[3], in[4]); 789 s5 = _mm_sub_epi16(in[2], in[5]); 790 s6 = _mm_sub_epi16(in[1], in[6]); 791 s7 = _mm_sub_epi16(in[0], in[7]); 792 793 u0 = _mm_add_epi16(s0, s3); 794 u1 = _mm_add_epi16(s1, s2); 795 u2 = _mm_sub_epi16(s1, s2); 796 u3 = _mm_sub_epi16(s0, s3); 797 // interleave and perform butterfly multiplication/addition 798 v0 = _mm_unpacklo_epi16(u0, u1); 799 v1 = _mm_unpackhi_epi16(u0, u1); 800 v2 = _mm_unpacklo_epi16(u2, u3); 801 v3 = _mm_unpackhi_epi16(u2, u3); 802 803 u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); 804 u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); 805 u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); 806 u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); 807 u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); 808 u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); 809 u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); 810 u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); 811 812 // shift and rounding 813 v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 814 v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 815 v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 816 v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 817 v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 818 v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 819 v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 820 v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 821 822 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 823 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 824 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 825 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 826 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 827 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 828 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 829 u7 = _mm_srai_epi32(v7, 
DCT_CONST_BITS); 830 831 in[0] = _mm_packs_epi32(u0, u1); 832 in[2] = _mm_packs_epi32(u4, u5); 833 in[4] = _mm_packs_epi32(u2, u3); 834 in[6] = _mm_packs_epi32(u6, u7); 835 836 // stage 2 837 // interleave and perform butterfly multiplication/addition 838 u0 = _mm_unpacklo_epi16(s6, s5); 839 u1 = _mm_unpackhi_epi16(s6, s5); 840 v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); 841 v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); 842 v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); 843 v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); 844 845 // shift and rounding 846 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 847 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 848 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 849 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 850 851 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 852 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 853 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 854 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 855 856 u0 = _mm_packs_epi32(v0, v1); 857 u1 = _mm_packs_epi32(v2, v3); 858 859 // stage 3 860 s0 = _mm_add_epi16(s4, u0); 861 s1 = _mm_sub_epi16(s4, u0); 862 s2 = _mm_sub_epi16(s7, u1); 863 s3 = _mm_add_epi16(s7, u1); 864 865 // stage 4 866 u0 = _mm_unpacklo_epi16(s0, s3); 867 u1 = _mm_unpackhi_epi16(s0, s3); 868 u2 = _mm_unpacklo_epi16(s1, s2); 869 u3 = _mm_unpackhi_epi16(s1, s2); 870 871 v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); 872 v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); 873 v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); 874 v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); 875 v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); 876 v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); 877 v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); 878 v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); 879 880 // shift and rounding 881 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 882 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 883 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 884 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 885 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 886 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 887 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 888 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 889 890 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 891 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 892 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 893 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 894 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 895 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 896 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 897 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 898 899 in[1] = _mm_packs_epi32(v0, v1); 900 in[3] = _mm_packs_epi32(v4, v5); 901 in[5] = _mm_packs_epi32(v2, v3); 902 in[7] = _mm_packs_epi32(v6, v7); 903 904 // transpose 905 array_transpose_8x8(in, in); 906} 907 908static void fadst8_sse2(__m128i *in) { 909 // Constants 910 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 911 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 912 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 913 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 914 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 915 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 916 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 917 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 918 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 919 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, 
-cospi_8_64); 920 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 921 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 922 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 923 const __m128i k__const_0 = _mm_set1_epi16(0); 924 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 925 926 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 927 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 928 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 929 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 930 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 931 932 // properly aligned for butterfly input 933 in0 = in[7]; 934 in1 = in[0]; 935 in2 = in[5]; 936 in3 = in[2]; 937 in4 = in[3]; 938 in5 = in[4]; 939 in6 = in[1]; 940 in7 = in[6]; 941 942 // column transformation 943 // stage 1 944 // interleave and multiply/add into 32-bit integer 945 s0 = _mm_unpacklo_epi16(in0, in1); 946 s1 = _mm_unpackhi_epi16(in0, in1); 947 s2 = _mm_unpacklo_epi16(in2, in3); 948 s3 = _mm_unpackhi_epi16(in2, in3); 949 s4 = _mm_unpacklo_epi16(in4, in5); 950 s5 = _mm_unpackhi_epi16(in4, in5); 951 s6 = _mm_unpacklo_epi16(in6, in7); 952 s7 = _mm_unpackhi_epi16(in6, in7); 953 954 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 955 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 956 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 957 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 958 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 959 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 960 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 961 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 962 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 963 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 964 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 965 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 966 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 967 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 968 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 969 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 970 971 // addition 972 w0 = _mm_add_epi32(u0, u8); 973 w1 = _mm_add_epi32(u1, u9); 974 w2 = _mm_add_epi32(u2, u10); 975 w3 = _mm_add_epi32(u3, u11); 976 w4 = _mm_add_epi32(u4, u12); 977 w5 = _mm_add_epi32(u5, u13); 978 w6 = _mm_add_epi32(u6, u14); 979 w7 = _mm_add_epi32(u7, u15); 980 w8 = _mm_sub_epi32(u0, u8); 981 w9 = _mm_sub_epi32(u1, u9); 982 w10 = _mm_sub_epi32(u2, u10); 983 w11 = _mm_sub_epi32(u3, u11); 984 w12 = _mm_sub_epi32(u4, u12); 985 w13 = _mm_sub_epi32(u5, u13); 986 w14 = _mm_sub_epi32(u6, u14); 987 w15 = _mm_sub_epi32(u7, u15); 988 989 // shift and rounding 990 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 991 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 992 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 993 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 994 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 995 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 996 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 997 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 998 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 999 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 1000 v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 1001 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 1002 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 1003 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 1004 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 1005 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 1006 1007 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1008 
u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1009 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1010 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1011 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1012 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1013 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1014 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1015 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 1016 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 1017 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 1018 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 1019 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 1020 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 1021 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 1022 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 1023 1024 // back to 16-bit and pack 8 integers into __m128i 1025 in[0] = _mm_packs_epi32(u0, u1); 1026 in[1] = _mm_packs_epi32(u2, u3); 1027 in[2] = _mm_packs_epi32(u4, u5); 1028 in[3] = _mm_packs_epi32(u6, u7); 1029 in[4] = _mm_packs_epi32(u8, u9); 1030 in[5] = _mm_packs_epi32(u10, u11); 1031 in[6] = _mm_packs_epi32(u12, u13); 1032 in[7] = _mm_packs_epi32(u14, u15); 1033 1034 // stage 2 1035 s0 = _mm_add_epi16(in[0], in[2]); 1036 s1 = _mm_add_epi16(in[1], in[3]); 1037 s2 = _mm_sub_epi16(in[0], in[2]); 1038 s3 = _mm_sub_epi16(in[1], in[3]); 1039 u0 = _mm_unpacklo_epi16(in[4], in[5]); 1040 u1 = _mm_unpackhi_epi16(in[4], in[5]); 1041 u2 = _mm_unpacklo_epi16(in[6], in[7]); 1042 u3 = _mm_unpackhi_epi16(in[6], in[7]); 1043 1044 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 1045 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 1046 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 1047 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 1048 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 1049 v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 1050 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 1051 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 1052 1053 w0 = _mm_add_epi32(v0, v4); 1054 w1 = _mm_add_epi32(v1, v5); 1055 w2 = _mm_add_epi32(v2, v6); 1056 w3 = _mm_add_epi32(v3, v7); 1057 w4 = _mm_sub_epi32(v0, v4); 1058 w5 = _mm_sub_epi32(v1, v5); 1059 w6 = _mm_sub_epi32(v2, v6); 1060 w7 = _mm_sub_epi32(v3, v7); 1061 1062 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 1063 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 1064 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 1065 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 1066 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 1067 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 1068 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 1069 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 1070 1071 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1072 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1073 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1074 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1075 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1076 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1077 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1078 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1079 1080 // back to 16-bit intergers 1081 s4 = _mm_packs_epi32(u0, u1); 1082 s5 = _mm_packs_epi32(u2, u3); 1083 s6 = _mm_packs_epi32(u4, u5); 1084 s7 = _mm_packs_epi32(u6, u7); 1085 1086 // stage 3 1087 u0 = _mm_unpacklo_epi16(s2, s3); 1088 u1 = _mm_unpackhi_epi16(s2, s3); 1089 u2 = _mm_unpacklo_epi16(s6, s7); 1090 u3 = _mm_unpackhi_epi16(s6, s7); 1091 1092 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 1093 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 1094 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 1095 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 1096 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 1097 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 1098 v6 = _mm_madd_epi16(u2, 
k__cospi_p16_m16); 1099 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 1100 1101 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 1102 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 1103 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 1104 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 1105 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 1106 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 1107 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 1108 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 1109 1110 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 1111 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 1112 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 1113 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 1114 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 1115 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 1116 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 1117 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 1118 1119 s2 = _mm_packs_epi32(v0, v1); 1120 s3 = _mm_packs_epi32(v2, v3); 1121 s6 = _mm_packs_epi32(v4, v5); 1122 s7 = _mm_packs_epi32(v6, v7); 1123 1124 // FIXME(jingning): do subtract using bit inversion? 1125 in[0] = s0; 1126 in[1] = _mm_sub_epi16(k__const_0, s4); 1127 in[2] = s6; 1128 in[3] = _mm_sub_epi16(k__const_0, s2); 1129 in[4] = s3; 1130 in[5] = _mm_sub_epi16(k__const_0, s7); 1131 in[6] = s5; 1132 in[7] = _mm_sub_epi16(k__const_0, s1); 1133 1134 // transpose 1135 array_transpose_8x8(in, in); 1136} 1137 1138void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, 1139 int stride, int tx_type) { 1140 __m128i in[8]; 1141 1142 switch (tx_type) { 1143 case DCT_DCT: 1144 vpx_fdct8x8_sse2(input, output, stride); 1145 break; 1146 case ADST_DCT: 1147 load_buffer_8x8(input, in, stride); 1148 fadst8_sse2(in); 1149 fdct8_sse2(in); 1150 right_shift_8x8(in, 1); 1151 write_buffer_8x8(output, in, 8); 1152 break; 1153 case DCT_ADST: 1154 load_buffer_8x8(input, in, stride); 1155 fdct8_sse2(in); 1156 fadst8_sse2(in); 1157 right_shift_8x8(in, 1); 1158 write_buffer_8x8(output, in, 8); 1159 break; 1160 case ADST_ADST: 1161 load_buffer_8x8(input, in, stride); 1162 fadst8_sse2(in); 1163 fadst8_sse2(in); 1164 right_shift_8x8(in, 1); 1165 write_buffer_8x8(output, in, 8); 1166 break; 1167 default: 1168 assert(0); 1169 break; 1170 } 1171} 1172 1173static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, 1174 __m128i *in1, int stride) { 1175 // load first 8 columns 1176 load_buffer_8x8(input, in0, stride); 1177 load_buffer_8x8(input + 8 * stride, in0 + 8, stride); 1178 1179 input += 8; 1180 // load second 8 columns 1181 load_buffer_8x8(input, in1, stride); 1182 load_buffer_8x8(input + 8 * stride, in1 + 8, stride); 1183} 1184 1185static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, 1186 __m128i *in1, int stride) { 1187 // write first 8 columns 1188 write_buffer_8x8(output, in0, stride); 1189 write_buffer_8x8(output + 8 * stride, in0 + 8, stride); 1190 // write second 8 columns 1191 output += 8; 1192 write_buffer_8x8(output, in1, stride); 1193 write_buffer_8x8(output + 8 * stride, in1 + 8, stride); 1194} 1195 1196static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 1197 __m128i tbuf[8]; 1198 array_transpose_8x8(res0, res0); 1199 array_transpose_8x8(res1, tbuf); 1200 array_transpose_8x8(res0 + 8, res1); 1201 array_transpose_8x8(res1 + 8, res1 + 8); 1202 1203 res0[8] = tbuf[0]; 1204 res0[9] = tbuf[1]; 1205 res0[10] = tbuf[2]; 1206 res0[11] = tbuf[3]; 1207 res0[12] = tbuf[4]; 1208 res0[13] = tbuf[5]; 1209 res0[14] = tbuf[6]; 1210 res0[15] = tbuf[7]; 1211} 1212 1213static INLINE void 
right_shift_16x16(__m128i *res0, __m128i *res1) { 1214 // perform rounding operations 1215 right_shift_8x8(res0, 2); 1216 right_shift_8x8(res0 + 8, 2); 1217 right_shift_8x8(res1, 2); 1218 right_shift_8x8(res1 + 8, 2); 1219} 1220 1221static void fdct16_8col(__m128i *in) { 1222 // perform 16x16 1-D DCT for 8 columns 1223 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; 1224 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 1225 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1226 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1227 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1228 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); 1229 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1230 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 1231 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1232 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 1233 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1234 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 1235 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 1236 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1237 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1238 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 1239 const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 1240 const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1241 const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1242 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1243 1244 // stage 1 1245 i[0] = _mm_add_epi16(in[0], in[15]); 1246 i[1] = _mm_add_epi16(in[1], in[14]); 1247 i[2] = _mm_add_epi16(in[2], in[13]); 1248 i[3] = _mm_add_epi16(in[3], in[12]); 1249 i[4] = _mm_add_epi16(in[4], in[11]); 1250 i[5] = _mm_add_epi16(in[5], in[10]); 1251 i[6] = _mm_add_epi16(in[6], in[9]); 1252 i[7] = _mm_add_epi16(in[7], in[8]); 1253 1254 s[0] = _mm_sub_epi16(in[7], in[8]); 1255 s[1] = _mm_sub_epi16(in[6], in[9]); 1256 s[2] = _mm_sub_epi16(in[5], in[10]); 1257 s[3] = _mm_sub_epi16(in[4], in[11]); 1258 s[4] = _mm_sub_epi16(in[3], in[12]); 1259 s[5] = _mm_sub_epi16(in[2], in[13]); 1260 s[6] = _mm_sub_epi16(in[1], in[14]); 1261 s[7] = _mm_sub_epi16(in[0], in[15]); 1262 1263 p[0] = _mm_add_epi16(i[0], i[7]); 1264 p[1] = _mm_add_epi16(i[1], i[6]); 1265 p[2] = _mm_add_epi16(i[2], i[5]); 1266 p[3] = _mm_add_epi16(i[3], i[4]); 1267 p[4] = _mm_sub_epi16(i[3], i[4]); 1268 p[5] = _mm_sub_epi16(i[2], i[5]); 1269 p[6] = _mm_sub_epi16(i[1], i[6]); 1270 p[7] = _mm_sub_epi16(i[0], i[7]); 1271 1272 u[0] = _mm_add_epi16(p[0], p[3]); 1273 u[1] = _mm_add_epi16(p[1], p[2]); 1274 u[2] = _mm_sub_epi16(p[1], p[2]); 1275 u[3] = _mm_sub_epi16(p[0], p[3]); 1276 1277 v[0] = _mm_unpacklo_epi16(u[0], u[1]); 1278 v[1] = _mm_unpackhi_epi16(u[0], u[1]); 1279 v[2] = _mm_unpacklo_epi16(u[2], u[3]); 1280 v[3] = _mm_unpackhi_epi16(u[2], u[3]); 1281 1282 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); 1283 u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); 1284 u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); 1285 u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); 1286 u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); 1287 u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); 
1288 u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); 1289 u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); 1290 1291 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1292 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1293 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1294 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1295 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1296 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1297 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1298 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1299 1300 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1301 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1302 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1303 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1304 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1305 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1306 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1307 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1308 1309 in[0] = _mm_packs_epi32(u[0], u[1]); 1310 in[4] = _mm_packs_epi32(u[4], u[5]); 1311 in[8] = _mm_packs_epi32(u[2], u[3]); 1312 in[12] = _mm_packs_epi32(u[6], u[7]); 1313 1314 u[0] = _mm_unpacklo_epi16(p[5], p[6]); 1315 u[1] = _mm_unpackhi_epi16(p[5], p[6]); 1316 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1317 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1318 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1319 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1320 1321 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1322 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1323 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1324 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1325 1326 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1327 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1328 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1329 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1330 1331 u[0] = _mm_packs_epi32(v[0], v[1]); 1332 u[1] = _mm_packs_epi32(v[2], v[3]); 1333 1334 t[0] = _mm_add_epi16(p[4], u[0]); 1335 t[1] = _mm_sub_epi16(p[4], u[0]); 1336 t[2] = _mm_sub_epi16(p[7], u[1]); 1337 t[3] = _mm_add_epi16(p[7], u[1]); 1338 1339 u[0] = _mm_unpacklo_epi16(t[0], t[3]); 1340 u[1] = _mm_unpackhi_epi16(t[0], t[3]); 1341 u[2] = _mm_unpacklo_epi16(t[1], t[2]); 1342 u[3] = _mm_unpackhi_epi16(t[1], t[2]); 1343 1344 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); 1345 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); 1346 v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); 1347 v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); 1348 v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); 1349 v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); 1350 v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); 1351 v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); 1352 1353 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1354 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1355 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1356 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1357 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1358 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1359 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1360 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1361 1362 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1363 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1364 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1365 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1366 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1367 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1368 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1369 v[7] = 
_mm_srai_epi32(u[7], DCT_CONST_BITS); 1370 1371 in[2] = _mm_packs_epi32(v[0], v[1]); 1372 in[6] = _mm_packs_epi32(v[4], v[5]); 1373 in[10] = _mm_packs_epi32(v[2], v[3]); 1374 in[14] = _mm_packs_epi32(v[6], v[7]); 1375 1376 // stage 2 1377 u[0] = _mm_unpacklo_epi16(s[2], s[5]); 1378 u[1] = _mm_unpackhi_epi16(s[2], s[5]); 1379 u[2] = _mm_unpacklo_epi16(s[3], s[4]); 1380 u[3] = _mm_unpackhi_epi16(s[3], s[4]); 1381 1382 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1383 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1384 v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 1385 v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 1386 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1387 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 1388 v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1389 v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1390 1391 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1392 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1393 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1394 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1395 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1396 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1397 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1398 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1399 1400 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1401 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1402 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1403 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1404 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1405 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1406 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1407 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1408 1409 t[2] = _mm_packs_epi32(v[0], v[1]); 1410 t[3] = _mm_packs_epi32(v[2], v[3]); 1411 t[4] = _mm_packs_epi32(v[4], v[5]); 1412 t[5] = _mm_packs_epi32(v[6], v[7]); 1413 1414 // stage 3 1415 p[0] = _mm_add_epi16(s[0], t[3]); 1416 p[1] = _mm_add_epi16(s[1], t[2]); 1417 p[2] = _mm_sub_epi16(s[1], t[2]); 1418 p[3] = _mm_sub_epi16(s[0], t[3]); 1419 p[4] = _mm_sub_epi16(s[7], t[4]); 1420 p[5] = _mm_sub_epi16(s[6], t[5]); 1421 p[6] = _mm_add_epi16(s[6], t[5]); 1422 p[7] = _mm_add_epi16(s[7], t[4]); 1423 1424 // stage 4 1425 u[0] = _mm_unpacklo_epi16(p[1], p[6]); 1426 u[1] = _mm_unpackhi_epi16(p[1], p[6]); 1427 u[2] = _mm_unpacklo_epi16(p[2], p[5]); 1428 u[3] = _mm_unpackhi_epi16(p[2], p[5]); 1429 1430 v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); 1431 v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); 1432 v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); 1433 v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); 1434 v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); 1435 v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); 1436 v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); 1437 v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); 1438 1439 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1440 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1441 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1442 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1443 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1444 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1445 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1446 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1447 1448 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1449 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1450 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1451 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1452 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1453 v[5] = _mm_srai_epi32(u[5], 
  // stage 2
  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
  u[3] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[2] = _mm_packs_epi32(v[0], v[1]);
  t[3] = _mm_packs_epi32(v[2], v[3]);
  t[4] = _mm_packs_epi32(v[4], v[5]);
  t[5] = _mm_packs_epi32(v[6], v[7]);

  // stage 3
  p[0] = _mm_add_epi16(s[0], t[3]);
  p[1] = _mm_add_epi16(s[1], t[2]);
  p[2] = _mm_sub_epi16(s[1], t[2]);
  p[3] = _mm_sub_epi16(s[0], t[3]);
  p[4] = _mm_sub_epi16(s[7], t[4]);
  p[5] = _mm_sub_epi16(s[6], t[5]);
  p[6] = _mm_add_epi16(s[6], t[5]);
  p[7] = _mm_add_epi16(s[7], t[4]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
  u[3] = _mm_unpackhi_epi16(p[2], p[5]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[1] = _mm_packs_epi32(v[0], v[1]);
  t[2] = _mm_packs_epi32(v[2], v[3]);
  t[5] = _mm_packs_epi32(v[4], v[5]);
  t[6] = _mm_packs_epi32(v[6], v[7]);
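  // Stage 5 is pure 16-bit butterflies; no rounding is needed again until
  // the stage-6 rotations.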
  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
  s[2] = _mm_add_epi16(p[3], t[2]);
  s[3] = _mm_sub_epi16(p[3], t[2]);
  s[4] = _mm_sub_epi16(p[4], t[5]);
  s[5] = _mm_add_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);

  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v[0], v[1]);
  in[9] = _mm_packs_epi32(v[2], v[3]);
  in[5] = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3] = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7] = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}
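/*
 * Each _mm_madd_epi16 / _mm_add_epi32 / _mm_srai_epi32 / _mm_packs_epi32
 * run above is the vector form of the fixed-point butterfly rotation used
 * throughout this file.  A scalar sketch of one lane (the helper name is
 * illustrative only, not part of the library):
 *
 *   static INLINE int16_t butterfly_one(int16_t a, int16_t b,
 *                                       int16_t c1, int16_t c2) {
 *     const int32_t t = (int32_t)a * c1 + (int32_t)b * c2;
 *     return (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
 *   }
 *
 * i.e. fdct_round_shift(a * c1 + b * c2) from vpx_dsp/txfm_common.h, with
 * _mm_packs_epi32 supplying the saturating narrow back to 16 bits.
 */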
static void fadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);
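  // Round, shift and pack the 32 partial sums down to the 16 stage-1
  // outputs s[0..15].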
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);
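  // In stage 2 only s[8..15] go through the +/-cospi_{4,28} and
  // +/-cospi_{20,12} rotations; s[0..7] are combined with plain adds and
  // subtracts further below.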
  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
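  // The unrotated half stays in 16 bits (x[0..7]); the rotated half is
  // packed back down from the 32-bit accumulators (x[8..15]).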
  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
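  // Shift down and merge: s[0..3]/s[8..11] come from 16-bit butterflies on
  // x[], while s[4..7]/s[12..15] pack the rotated 32-bit results.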
  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
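  // Final shift and output assembly; outputs 1, 3, 13 and 15 are negated
  // (kZero - s[x]) per the ADST output sign pattern.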
  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

static void fdct16_sse2(__m128i *in0, __m128i *in1) {
  fdct16_8col(in0);
  fdct16_8col(in1);
  array_transpose_16x16(in0, in1);
}

static void fadst16_sse2(__m128i *in0, __m128i *in1) {
  fadst16_8col(in0);
  fadst16_8col(in1);
  array_transpose_16x16(in0, in1);
}

void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
                       int stride, int tx_type) {
  __m128i in0[16], in1[16];

  switch (tx_type) {
    case DCT_DCT:
      vpx_fdct16x16_sse2(input, output, stride);
      break;
    case ADST_DCT:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fdct16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case ADST_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    default:
      assert(0);
      break;
  }
}
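/*
 * Usage sketch (illustrative only; the 'src' and 'coeffs' buffers are
 * hypothetical, not part of this file):
 *
 *   DECLARE_ALIGNED(16, int16_t, src[16 * 16]);
 *   DECLARE_ALIGNED(16, tran_low_t, coeffs[16 * 16]);
 *   vp9_fht16x16_sse2(src, coeffs, 16, ADST_DCT);
 *
 * DCT_DCT forwards directly to vpx_fdct16x16_sse2(); the hybrid types run
 * the first 1-D transform over both 8-column halves (with a transpose),
 * right-shift to contain the intermediate dynamic range, then run the
 * second 1-D transform before writing out the coefficients.
 */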